@Override public boolean equals( Object object ) { if( this == object ) return true; if( object == null || getClass() != object.getClass() ) return false; GlobHfs globHfs = (GlobHfs) object; // do not compare tap arrays, these values should be sufficient to show identity if( getScheme() != null ? !getScheme().equals( globHfs.getScheme() ) : globHfs.getScheme() != null ) return false; if( pathFilter != null ? !pathFilter.equals( globHfs.pathFilter ) : globHfs.pathFilter != null ) return false; if( pathPattern != null ? !pathPattern.equals( globHfs.pathPattern ) : globHfs.pathPattern != null ) return false; return true; }
@Override
protected Hfs[] getTaps()
  {
  // no configuration is available here, so resolve against a default JobConf
  JobConf defaultConf = new JobConf();

  return initTapsInternal( defaultConf );
  }
/**
 * Lazily resolves the glob pattern into child taps and caches the result.
 * <p>
 * Subsequent calls return the cached array without re-querying the filesystem.
 *
 * @param conf the current Hadoop configuration used to resolve the filesystem
 * @return the resolved child taps, never null
 * @throws TapException if the glob pattern cannot be resolved
 */
private Hfs[] initTapsInternal( Configuration conf )
  {
  if( taps != null )
    return taps;

  try
    {
    taps = makeTaps( conf );
    }
  catch( IOException exception )
    {
    // preserve the underlying IOException as the cause so callers can diagnose the failure
    throw new TapException( "unable to resolve taps for globing path: " + pathPattern, exception );
    }

  return taps;
  }
@Test public void testGlobHfs() throws Exception { getPlatform().copyFromLocal( inputFileLower ); getPlatform().copyFromLocal( inputFileUpper ); GlobHfs source = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "?{ppe[_r],owe?}.txt" ); assertEquals( 2, source.getTaps().length ); // show globhfs will just match a directory if ended with a / assertEquals( 1, new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "../?ata/" ).getTaps().length ); Tap sink = new Hfs( new TextLine(), getOutputPath( "glob" ), SinkMode.REPLACE ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), "\\s" ); Pipe concatPipe = new Each( new Pipe( "concat" ), new Fields( "line" ), splitter ); Flow concatFlow = getPlatform().getFlowConnector( getProperties() ).connect( "first", source, sink, concatPipe ); Tap nextSink = new Hfs( new TextLine(), getOutputPath( "glob2" ), SinkMode.REPLACE ); Flow nextFlow = getPlatform().getFlowConnector( getProperties() ).connect( "second", sink, nextSink, concatPipe ); Cascade cascade = new CascadeConnector( getProperties() ).connect( concatFlow, nextFlow ); cascade.complete(); validateLength( concatFlow, 10 ); }
@Test
public void testMultiSourceIterator() throws Exception
  {
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  // two disjoint globs, each matching one of the input files
  GlobHfs upperGlob = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "?{ppe[_r]}.txt" );
  GlobHfs lowerGlob = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "?{owe?}.txt" );

  MultiSourceTap source = new MultiSourceTap( upperGlob, lowerGlob );

  validateLength( source.openForRead( getPlatform().getFlowProcess() ), 10 );

  // a single glob matching both files should yield the same total length
  GlobHfs bothGlob = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "?{ppe[_r],owe?}.txt" );

  source = new MultiSourceTap( bothGlob );

  validateLength( source.openForRead( getPlatform().getFlowProcess() ), 10, null );
  }
@Test public void testGlobHfs() throws Exception { getPlatform().copyFromLocal( inputFileLower ); getPlatform().copyFromLocal( inputFileUpper ); GlobHfs source = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "?{ppe[_r],owe?}.txt" ); assertEquals( 2, source.getTaps().length ); // show globhfs will just match a directory if ended with a / assertEquals( 1, new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "../?ata/" ).getTaps().length ); Tap sink = new Hfs( new TextLine(), getOutputPath( "glob" ), SinkMode.REPLACE ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), "\\s" ); Pipe concatPipe = new Each( new Pipe( "concat" ), new Fields( "line" ), splitter ); Flow concatFlow = getPlatform().getFlowConnector( getProperties() ).connect( "first", source, sink, concatPipe ); Tap nextSink = new Hfs( new TextLine(), getOutputPath( "glob2" ), SinkMode.REPLACE ); Flow nextFlow = getPlatform().getFlowConnector( getProperties() ).connect( "second", sink, nextSink, concatPipe ); Cascade cascade = new CascadeConnector( getProperties() ).connect( concatFlow, nextFlow ); cascade.complete(); validateLength( concatFlow, 10 ); }
@Test
public void testMultiSourceIterator() throws Exception
  {
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  // two disjoint globs, each matching one of the input files
  GlobHfs upperGlob = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "?{ppe[_r]}.txt" );
  GlobHfs lowerGlob = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "?{owe?}.txt" );

  MultiSourceTap source = new MultiSourceTap( upperGlob, lowerGlob );

  validateLength( source.openForRead( getPlatform().getFlowProcess() ), 10 );

  // a single glob matching both files should yield the same total length
  GlobHfs bothGlob = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "?{ppe[_r],owe?}.txt" );

  source = new MultiSourceTap( bothGlob );

  validateLength( source.openForRead( getPlatform().getFlowProcess() ), 10, null );
  }
@Override
protected Hfs[] getTaps()
  {
  // no configuration is available here, so resolve against a default JobConf
  JobConf defaultConf = new JobConf();

  return initTapsInternal( defaultConf );
  }
@Test public void testNestedMultiSourceGlobHfs() throws Exception { getPlatform().copyFromLocal( inputFileLower ); getPlatform().copyFromLocal( inputFileUpper ); GlobHfs source1 = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "?{ppe[_r]}.txt" ); GlobHfs source2 = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "?{owe?}.txt" ); MultiSourceTap source = new MultiSourceTap( source1, source2 ); assertEquals( 2, source.getNumChildTaps() ); // using null pos so all fields are written Tap sink = new Hfs( new TextLine(), getOutputPath( "globmultisource" ), SinkMode.REPLACE ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), "\\s" ); Pipe concatPipe = new Each( new Pipe( "concat" ), new Fields( "line" ), splitter ); Flow concatFlow = getPlatform().getFlowConnector( getProperties() ).connect( "first", source, sink, concatPipe ); Tap nextSink = new Hfs( new TextLine(), getOutputPath( "globmultiource2" ), SinkMode.REPLACE ); Flow nextFlow = getPlatform().getFlowConnector( getProperties() ).connect( "second", sink, nextSink, concatPipe ); Cascade cascade = new CascadeConnector( getProperties() ).connect( concatFlow, nextFlow ); cascade.complete(); validateLength( concatFlow, 10 ); }
@Override public boolean equals( Object object ) { if( this == object ) return true; if( object == null || getClass() != object.getClass() ) return false; GlobHfs globHfs = (GlobHfs) object; // do not compare tap arrays, these values should be sufficient to show identity if( getScheme() != null ? !getScheme().equals( globHfs.getScheme() ) : globHfs.getScheme() != null ) return false; if( pathFilter != null ? !pathFilter.equals( globHfs.pathFilter ) : globHfs.pathFilter != null ) return false; if( pathPattern != null ? !pathPattern.equals( globHfs.pathPattern ) : globHfs.pathPattern != null ) return false; return true; }
/**
 * Lazily resolves the glob pattern into child taps and caches the result.
 * <p>
 * Subsequent calls return the cached array without re-querying the filesystem.
 *
 * @param conf the current Hadoop configuration used to resolve the filesystem
 * @return the resolved child taps, never null
 * @throws TapException if the glob pattern cannot be resolved
 */
private Hfs[] initTapsInternal( Configuration conf )
  {
  if( taps != null )
    return taps;

  try
    {
    taps = makeTaps( conf );
    }
  catch( IOException exception )
    {
    // preserve the underlying IOException as the cause so callers can diagnose the failure
    throw new TapException( "unable to resolve taps for globing path: " + pathPattern, exception );
    }

  return taps;
  }
@Override public void sourceConfInit( FlowProcess<? extends Configuration> process, Configuration conf ) { Hfs[] taps = initTapsInternal( conf ); taps[ 0 ].sourceConfInitAddInputPaths( conf, new LazyIterable<Hfs, Path>( taps ) { @Override protected Path convert( Hfs next ) { return next.getPath(); // we are building fully qualified paths above } } ); taps[ 0 ].sourceConfInitComplete( process, conf ); }
@Test public void testNestedMultiSourceGlobHfs() throws Exception { getPlatform().copyFromLocal( inputFileLower ); getPlatform().copyFromLocal( inputFileUpper ); GlobHfs source1 = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "?{ppe[_r]}.txt" ); GlobHfs source2 = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "?{owe?}.txt" ); MultiSourceTap source = new MultiSourceTap( source1, source2 ); assertEquals( 2, source.getNumChildTaps() ); // using null pos so all fields are written Tap sink = new Hfs( new TextLine(), getOutputPath( "globmultisource" ), SinkMode.REPLACE ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), "\\s" ); Pipe concatPipe = new Each( new Pipe( "concat" ), new Fields( "line" ), splitter ); Flow concatFlow = getPlatform().getFlowConnector( getProperties() ).connect( "first", source, sink, concatPipe ); Tap nextSink = new Hfs( new TextLine(), getOutputPath( "globmultiource2" ), SinkMode.REPLACE ); Flow nextFlow = getPlatform().getFlowConnector( getProperties() ).connect( "second", sink, nextSink, concatPipe ); Cascade cascade = new CascadeConnector( getProperties() ).connect( concatFlow, nextFlow ); cascade.complete(); validateLength( concatFlow, 10 ); }
private Hfs[] makeTaps( Configuration conf ) throws IOException { FileStatus[] statusList; Path path = new Path( pathPattern ); FileSystem fileSystem = path.getFileSystem( conf ); if( pathFilter == null ) statusList = fileSystem.globStatus( path ); else statusList = fileSystem.globStatus( path, pathFilter ); if( statusList == null || statusList.length == 0 ) throw new TapException( "unable to find paths matching path pattern: " + pathPattern ); List<Hfs> notEmpty = new ArrayList<Hfs>(); for( int i = 0; i < statusList.length; i++ ) { // remove empty files. some hadoop versions return non-zero for dirs // so this jives with the expectations set in the above javadoc if( statusList[ i ].isDir() || statusList[ i ].getLen() != 0 ) notEmpty.add( new Hfs( getScheme(), statusList[ i ].getPath().toString() ) ); } if( notEmpty.isEmpty() ) throw new TapException( "all paths matching path pattern are zero length and not directories: " + pathPattern ); return notEmpty.toArray( new Hfs[ notEmpty.size() ] ); }
@Override public void sourceConfInit( FlowProcess<? extends Configuration> process, Configuration conf ) { Hfs[] taps = initTapsInternal( conf ); taps[ 0 ].sourceConfInitAddInputPaths( conf, new LazyIterable<Hfs, Path>( taps ) { @Override protected Path convert( Hfs next ) { return next.getPath(); // we are building fully qualified paths above } } ); taps[ 0 ].sourceConfInitComplete( process, conf ); }
private Hfs[] makeTaps( Configuration conf ) throws IOException { FileStatus[] statusList; Path path = new Path( pathPattern ); FileSystem fileSystem = path.getFileSystem( conf ); if( pathFilter == null ) statusList = fileSystem.globStatus( path ); else statusList = fileSystem.globStatus( path, pathFilter ); if( statusList == null || statusList.length == 0 ) throw new TapException( "unable to find paths matching path pattern: " + pathPattern ); List<Hfs> notEmpty = new ArrayList<Hfs>(); for( int i = 0; i < statusList.length; i++ ) { // remove empty files. some hadoop versions return non-zero for dirs // so this jives with the expectations set in the above javadoc if( statusList[ i ].isDir() || statusList[ i ].getLen() != 0 ) notEmpty.add( new Hfs( getScheme(), statusList[ i ].getPath().toString() ) ); } if( notEmpty.isEmpty() ) throw new TapException( "all paths matching path pattern are zero length and not directories: " + pathPattern ); return notEmpty.toArray( new Hfs[ notEmpty.size() ] ); }