/**
 * Runs a flow into a partition tap and asserts that, once the flow has completed, the path built
 * from the output path and {@code Hadoop18TapUtil.TEMPORARY_PATH} no longer exists.
 */
@Test
public void testTemporarySinkPathIsDeleted() throws Exception {
  getPlatform().copyFromLocal( inputFileLowerOffset );

  Tap source = getPlatform().getDelimitedFile( new Fields( "a", "b" ), " ", inputFileLowerOffset );
  Pipe tail = new Pipe( "test" );

  String outputPath = getOutputPath( "partition-tap-sink" );

  // the parent tap declares field "a"; the partition is declared on field "b"
  Tap parent = getPlatform().getDelimitedFile( new Fields( "a" ), " ", outputPath );
  Tap sink = getPlatform().getPartitionTap( parent, new DelimitedPartition( new Fields( "b" ) ), 1 );

  Flow flow = getPlatform().getFlowConnector().connect( source, sink, tail );
  flow.complete();

  // resolve the temporary path against the file system described by the flow's configuration
  Path temporaryPath = new Path( outputPath, Hadoop18TapUtil.TEMPORARY_PATH );
  Configuration flowConf = (Configuration) flow.getConfigCopy();
  FileSystem fs = temporaryPath.getFileSystem( flowConf );

  assertFalse( fs.exists( temporaryPath ) );
}
/**
 * Runs a flow into a partition tap and asserts that, once the flow has completed, the path built
 * from the output path and {@code Hadoop18TapUtil.TEMPORARY_PATH} no longer exists.
 */
@Test
public void testTemporarySinkPathIsDeleted() throws Exception {
  getPlatform().copyFromLocal( inputFileLowerOffset );

  Tap source = getPlatform().getDelimitedFile( new Fields( "a", "b" ), " ", inputFileLowerOffset );
  Pipe tail = new Pipe( "test" );

  String outputPath = getOutputPath( "partition-tap-sink" );

  // the parent tap declares field "a"; the partition is declared on field "b"
  Tap parent = getPlatform().getDelimitedFile( new Fields( "a" ), " ", outputPath );
  Tap sink = getPlatform().getPartitionTap( parent, new DelimitedPartition( new Fields( "b" ) ), 1 );

  Flow flow = getPlatform().getFlowConnector().connect( source, sink, tail );
  flow.complete();

  // resolve the temporary path against the file system described by the flow's configuration
  Path temporaryPath = new Path( outputPath, Hadoop18TapUtil.TEMPORARY_PATH );
  Configuration flowConf = (Configuration) flow.getConfigCopy();
  FileSystem fs = temporaryPath.getFileSystem( flowConf );

  assertFalse( fs.exists( temporaryPath ) );
}
@Test public void testResolvedSinkFields() throws IOException { getPlatform().copyFromLocal( inputFileLower ); Tap source = new Hfs( new TextLine( new Fields( "line" ) ), inputFileLower ); Pipe pipe = new Pipe( "test" ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); pipe = new Each( pipe, new Fields( "line" ), splitter ); Tap sink = new Hfs( new ResolvedScheme( new Fields( "num", "char" ) ), getOutputPath( "resolvedfields" ), SinkMode.REPLACE ); Flow flow = getPlatform().getFlowConnector( getProperties() ).connect( source, sink, pipe ); flow.complete(); List<Tuple> tuples = asList( flow, sink ); List<Object> values = new ArrayList<Object>(); for( Tuple tuple : tuples ) values.add( tuple.getObject( 1 ) ); assertTrue( values.contains( "1\ta" ) ); assertTrue( values.contains( "2\tb" ) ); assertTrue( values.contains( "3\tc" ) ); assertTrue( values.contains( "4\td" ) ); assertTrue( values.contains( "5\te" ) ); assertEquals( 5, tuples.size() ); // confirm the tuple iterator can handle nulls from the source assertEquals( 5, asList( flow, source ).size() ); }
@Test public void testResolvedSinkFields() throws IOException { getPlatform().copyFromLocal( inputFileLower ); Tap source = new Hfs( new TextLine( new Fields( "line" ) ), inputFileLower ); Pipe pipe = new Pipe( "test" ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); pipe = new Each( pipe, new Fields( "line" ), splitter ); Tap sink = new Hfs( new ResolvedScheme( new Fields( "num", "char" ) ), getOutputPath( "resolvedfields" ), SinkMode.REPLACE ); Flow flow = getPlatform().getFlowConnector( getProperties() ).connect( source, sink, pipe ); flow.complete(); List<Tuple> tuples = asList( flow, sink ); List<Object> values = new ArrayList<Object>(); for( Tuple tuple : tuples ) values.add( tuple.getObject( 1 ) ); assertTrue( values.contains( "1\ta" ) ); assertTrue( values.contains( "2\tb" ) ); assertTrue( values.contains( "3\tc" ) ); assertTrue( values.contains( "4\td" ) ); assertTrue( values.contains( "5\te" ) ); assertEquals( 5, tuples.size() ); // confirm the tuple iterator can handle nulls from the source assertEquals( 5, asList( flow, source ).size() ); }
/**
 * Writes the single tuple {@code (1, "1")} through a {@code TextLine} scheme with fields
 * "offset" and "line" to the location returned by {@code getOutputPath( path )}.
 *
 * @param path name handed to {@code getOutputPath} to resolve the target location
 * @throws IOException if the tap cannot be opened for writing
 */
private void writeFileTo( String path ) throws IOException {
  Hfs tap = new Hfs( new TextLine( new Fields( "offset", "line" ) ), getOutputPath( path ) );
  TupleEntryCollector collector = tap.openForWrite( getPlatform().getFlowProcess() );

  try {
    collector.add( new Tuple( 1, "1" ) );
  } finally {
    // always release the collector, even when adding the tuple throws
    collector.close();
  }
}
/**
 * Writes the single tuple {@code (1, "1")} through a {@code TextLine} scheme with fields
 * "offset" and "line" to the location returned by {@code getOutputPath( path )}.
 *
 * @param path name handed to {@code getOutputPath} to resolve the target location
 * @throws IOException if the tap cannot be opened for writing
 */
private void writeFileTo( String path ) throws IOException {
  Hfs tap = new Hfs( new TextLine( new Fields( "offset", "line" ) ), getOutputPath( path ) );
  TupleEntryCollector collector = tap.openForWrite( getPlatform().getFlowProcess() );

  try {
    collector.add( new Tuple( 1, "1" ) );
  } finally {
    // always release the collector, even when adding the tuple throws
    collector.close();
  }
}
@Test public void testDupeConfigFromScheme() throws IOException { getPlatform().copyFromLocal( inputFileLower ); getPlatform().copyFromLocal( inputFileUpper ); Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); Tap sourceUpper = getPlatform().getTap( new DupeConfigScheme( new Fields( "offset", "line" ) ), inputFileUpper, SinkMode.KEEP ); Map sources = new HashMap(); sources.put( "lower", sourceLower ); sources.put( "upper", sourceUpper ); Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "dupeconfig" ), SinkMode.REPLACE ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) ); // by default the source is decorated with a DistCacheTap which uses a Lfs tap to read the local file, so it is safe // to call #sourceConfInit a second time client side as we are leveraging a new tap instance Properties properties = flowConnectorProps() .setEnableDecorateAccumulatedTap( false ) .buildProperties( getProperties() ); Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, splice ); flow.complete(); validateLength( flow, 5 ); List<Tuple> values = getSinkAsList( flow ); assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) ); }
@Test public void testDupeConfigFromScheme() throws IOException { getPlatform().copyFromLocal( inputFileLower ); getPlatform().copyFromLocal( inputFileUpper ); Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); Tap sourceUpper = getPlatform().getTap( new DupeConfigScheme( new Fields( "offset", "line" ) ), inputFileUpper, SinkMode.KEEP ); Map sources = new HashMap(); sources.put( "lower", sourceLower ); sources.put( "upper", sourceUpper ); Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "dupeconfig" ), SinkMode.REPLACE ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) ); // by default the source is decorated with a DistCacheTap which uses a Lfs tap to read the local file, so it is safe // to call #sourceConfInit a second time client side as we are leveraging a new tap instance Properties properties = flowConnectorProps() .setEnableDecorateAccumulatedTap( false ) .buildProperties( getProperties() ); Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, splice ); flow.complete(); validateLength( flow, 5 ); List<Tuple> values = getSinkAsList( flow ); assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) ); }
@Test public void testGlobHfs() throws Exception { getPlatform().copyFromLocal( inputFileLower ); getPlatform().copyFromLocal( inputFileUpper ); GlobHfs source = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "?{ppe[_r],owe?}.txt" ); assertEquals( 2, source.getTaps().length ); // show globhfs will just match a directory if ended with a / assertEquals( 1, new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "../?ata/" ).getTaps().length ); Tap sink = new Hfs( new TextLine(), getOutputPath( "glob" ), SinkMode.REPLACE ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), "\\s" ); Pipe concatPipe = new Each( new Pipe( "concat" ), new Fields( "line" ), splitter ); Flow concatFlow = getPlatform().getFlowConnector( getProperties() ).connect( "first", source, sink, concatPipe ); Tap nextSink = new Hfs( new TextLine(), getOutputPath( "glob2" ), SinkMode.REPLACE ); Flow nextFlow = getPlatform().getFlowConnector( getProperties() ).connect( "second", sink, nextSink, concatPipe ); Cascade cascade = new CascadeConnector( getProperties() ).connect( concatFlow, nextFlow ); cascade.complete(); validateLength( concatFlow, 10 ); }
@Test public void testNestedMultiSourceGlobHfs() throws Exception { getPlatform().copyFromLocal( inputFileLower ); getPlatform().copyFromLocal( inputFileUpper ); GlobHfs source1 = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "?{ppe[_r]}.txt" ); GlobHfs source2 = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "?{owe?}.txt" ); MultiSourceTap source = new MultiSourceTap( source1, source2 ); assertEquals( 2, source.getNumChildTaps() ); // using null pos so all fields are written Tap sink = new Hfs( new TextLine(), getOutputPath( "globmultisource" ), SinkMode.REPLACE ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), "\\s" ); Pipe concatPipe = new Each( new Pipe( "concat" ), new Fields( "line" ), splitter ); Flow concatFlow = getPlatform().getFlowConnector( getProperties() ).connect( "first", source, sink, concatPipe ); Tap nextSink = new Hfs( new TextLine(), getOutputPath( "globmultiource2" ), SinkMode.REPLACE ); Flow nextFlow = getPlatform().getFlowConnector( getProperties() ).connect( "second", sink, nextSink, concatPipe ); Cascade cascade = new CascadeConnector( getProperties() ).connect( concatFlow, nextFlow ); cascade.complete(); validateLength( concatFlow, 10 ); }
@Test public void testNestedMultiSourceGlobHfs() throws Exception { getPlatform().copyFromLocal( inputFileLower ); getPlatform().copyFromLocal( inputFileUpper ); GlobHfs source1 = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "?{ppe[_r]}.txt" ); GlobHfs source2 = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "?{owe?}.txt" ); MultiSourceTap source = new MultiSourceTap( source1, source2 ); assertEquals( 2, source.getNumChildTaps() ); // using null pos so all fields are written Tap sink = new Hfs( new TextLine(), getOutputPath( "globmultisource" ), SinkMode.REPLACE ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), "\\s" ); Pipe concatPipe = new Each( new Pipe( "concat" ), new Fields( "line" ), splitter ); Flow concatFlow = getPlatform().getFlowConnector( getProperties() ).connect( "first", source, sink, concatPipe ); Tap nextSink = new Hfs( new TextLine(), getOutputPath( "globmultiource2" ), SinkMode.REPLACE ); Flow nextFlow = getPlatform().getFlowConnector( getProperties() ).connect( "second", sink, nextSink, concatPipe ); Cascade cascade = new CascadeConnector( getProperties() ).connect( concatFlow, nextFlow ); cascade.complete(); validateLength( concatFlow, 10 ); }
@Test public void testGlobHfs() throws Exception { getPlatform().copyFromLocal( inputFileLower ); getPlatform().copyFromLocal( inputFileUpper ); GlobHfs source = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "?{ppe[_r],owe?}.txt" ); assertEquals( 2, source.getTaps().length ); // show globhfs will just match a directory if ended with a / assertEquals( 1, new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "../?ata/" ).getTaps().length ); Tap sink = new Hfs( new TextLine(), getOutputPath( "glob" ), SinkMode.REPLACE ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), "\\s" ); Pipe concatPipe = new Each( new Pipe( "concat" ), new Fields( "line" ), splitter ); Flow concatFlow = getPlatform().getFlowConnector( getProperties() ).connect( "first", source, sink, concatPipe ); Tap nextSink = new Hfs( new TextLine(), getOutputPath( "glob2" ), SinkMode.REPLACE ); Flow nextFlow = getPlatform().getFlowConnector( getProperties() ).connect( "second", sink, nextSink, concatPipe ); Cascade cascade = new CascadeConnector( getProperties() ).connect( concatFlow, nextFlow ); cascade.complete(); validateLength( concatFlow, 10 ); }
@Test public void testNullsFromScheme() throws IOException { getPlatform().copyFromLocal( inputFileComments ); Tap source = new Hfs( new CommentScheme( new Fields( "line" ) ), inputFileComments ); Pipe pipe = new Pipe( "test" ); pipe = new Each( pipe, new Identity() ); Tap sink = new Hfs( new TextLine( 1 ), getOutputPath( "testnulls" ), SinkMode.REPLACE ); Flow flow = getPlatform().getFlowConnector( getProperties() ).connect( source, sink, pipe ); flow.complete(); validateLength( flow, 5, null ); TupleEntryIterator iterator = flow.openSink(); assertEquals( "not equal: tuple.get(1)", "1 a", iterator.next().getObject( 1 ) ); iterator.close(); // confirm the tuple iterator can handle nulls from the source validateLength( flow.openSource(), 5 ); }
@Test public void testNullsFromScheme() throws IOException { getPlatform().copyFromLocal( inputFileComments ); Tap source = new Hfs( new CommentScheme( new Fields( "line" ) ), inputFileComments ); Pipe pipe = new Pipe( "test" ); pipe = new Each( pipe, new Identity() ); Tap sink = new Hfs( new TextLine( 1 ), getOutputPath( "testnulls" ), SinkMode.REPLACE ); Flow flow = getPlatform().getFlowConnector( getProperties() ).connect( source, sink, pipe ); flow.complete(); validateLength( flow, 5, null ); TupleEntryIterator iterator = flow.openSink(); assertEquals( "not equal: tuple.get(1)", "1 a", iterator.next().getObject( 1 ) ); iterator.close(); // confirm the tuple iterator can handle nulls from the source validateLength( flow.openSource(), 5 ); }
/**
 * Counts per-ip occurrences in the apache input and writes them to an {@code Hfs} sink whose
 * {@code commitResource} override only increments a counter. Asserts the override was invoked
 * exactly once and that the sink holds eight lines.
 */
@Test
public void testCommitResource() throws Exception {
  getPlatform().copyFromLocal( inputFileApache );

  Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );

  // extract the leading non-space token of each line as "ip", then count per ip
  Pipe assembly = new Pipe( "test" );
  assembly = new Each( assembly, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );
  assembly = new GroupBy( assembly, new Fields( "ip" ) );
  assembly = new Every( assembly, new Count(), new Fields( "ip", "count" ) );

  // single-element array so the anonymous subclass can mutate the counter
  final int[] commitCalls = {0};

  Tap sink = new Hfs( new TextDelimited( Fields.ALL ), getOutputPath( "committap" ), SinkMode.REPLACE ) {
    @Override
    public boolean commitResource( Configuration conf ) throws IOException {
      commitCalls[ 0 ]++;

      return true;
    }
  };

  Flow flow = getPlatform().getFlowConnector().connect( source, sink, assembly );
  flow.complete();

  assertEquals( 1, commitCalls[ 0 ] );
  validateLength( flow, 8, null );
}
Tap sink = new Hfs( new TextDelimited( Fields.ALL ), getOutputPath( "preparewritetapfail" ), SinkMode.REPLACE )
/**
 * Counts per-ip occurrences in the apache input and writes them to an {@code Hfs} sink whose
 * {@code commitResource} override only increments a counter. Asserts the override was invoked
 * exactly once and that the sink holds eight lines.
 */
@Test
public void testCommitResource() throws Exception {
  getPlatform().copyFromLocal( inputFileApache );

  Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );

  // extract the leading non-space token of each line as "ip", then count per ip
  Pipe assembly = new Pipe( "test" );
  assembly = new Each( assembly, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );
  assembly = new GroupBy( assembly, new Fields( "ip" ) );
  assembly = new Every( assembly, new Count(), new Fields( "ip", "count" ) );

  // single-element array so the anonymous subclass can mutate the counter
  final int[] commitCalls = {0};

  Tap sink = new Hfs( new TextDelimited( Fields.ALL ), getOutputPath( "committap" ), SinkMode.REPLACE ) {
    @Override
    public boolean commitResource( Configuration conf ) throws IOException {
      commitCalls[ 0 ]++;

      return true;
    }
  };

  Flow flow = getPlatform().getFlowConnector().connect( source, sink, assembly );
  flow.complete();

  assertEquals( 1, commitCalls[ 0 ] );
  validateLength( flow, 8, null );
}
@Test public void testMissingInputFormat() throws Exception { getPlatform().copyFromLocal( inputFileApache ); Tap source = new Hfs( new TextDelimited( new Fields( "offset", "line" ) ), inputFileApache ) { @Override public void sourceConfInit( FlowProcess<? extends Configuration> process, Configuration conf ) { // don't set input format //super.sourceConfInit( process, conf ); } }; Pipe pipe = new Pipe( "test" ); pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) ); pipe = new GroupBy( pipe, new Fields( "ip" ) ); pipe = new Every( pipe, new Count(), new Fields( "ip", "count" ) ); Tap sink = new Hfs( new TextDelimited( Fields.ALL ), getOutputPath( "missinginputformat" ), SinkMode.REPLACE ); try { Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe ); flow.complete(); fail( "did not test for missing input format" ); } catch( Exception exception ) { // ignore } }
Tap sink = new Hfs( new TextDelimited( Fields.ALL ), getOutputPath( "preparereadtapfail" ), SinkMode.REPLACE );
@Test public void testMissingInputFormat() throws Exception { getPlatform().copyFromLocal( inputFileApache ); Tap source = new Hfs( new TextDelimited( new Fields( "offset", "line" ) ), inputFileApache ) { @Override public void sourceConfInit( FlowProcess<? extends Configuration> process, Configuration conf ) { // don't set input format //super.sourceConfInit( process, conf ); } }; Pipe pipe = new Pipe( "test" ); pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) ); pipe = new GroupBy( pipe, new Fields( "ip" ) ); pipe = new Every( pipe, new Count(), new Fields( "ip", "count" ) ); Tap sink = new Hfs( new TextDelimited( Fields.ALL ), getOutputPath( "missinginputformat" ), SinkMode.REPLACE ); try { Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe ); flow.complete(); fail( "did not test for missing input format" ); } catch( Exception exception ) { // ignore } }