@Override
protected TupleEntrySchemeIterator createTupleEntrySchemeIterator( FlowProcess<? extends Configuration> flowProcess, Tap parent, String path, RecordReader recordReader ) throws IOException
  {
  return new HadoopTupleEntrySchemeIterator( flowProcess, new Hfs( parent.getScheme(), path ), recordReader );
  }
public boolean testExists( Flow flow, Hfs tap )
  {
  try
    {
    // don't test for _SUCCESS if the tap is a file, only if a directory
    if( !tap.isDirectory( flow.getFlowProcess() ) )
      return true;

    return new Hfs( new TextLine(), new Path( tap.getPath(), "_SUCCESS" ).toString() ).resourceExists( flow.getFlowProcess() );
    }
  catch( IOException exception )
    {
    throw new FlowException( exception );
    }
  }
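// A minimal usage sketch (the Flow "flow" and Hfs sink "sink" are assumed names):
// skip re-running a flow whose directory sink already carries Hadoop's _SUCCESS marker.
if( !testExists( flow, sink ) )
  flow.complete();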
public URI getDefaultURIScheme( Tap tap )
  {
  return ( (Hfs) tap ).getDefaultFileSystemURIScheme( defaultConfiguration );
  }
@Override
public TapWith<Configuration, RecordReader, OutputCollector> withScheme( Scheme<Configuration, RecordReader, OutputCollector, ?, ?> scheme )
  {
  return create( scheme, getPath(), getSinkMode() );
  }
/**
 * Given a configuration, populates the array of child {@link FileStatus} instances for this path.
 *
 * @param conf of type Configuration
 * @throws IOException on failure listing the path
 */
private void makeStatuses( Configuration conf ) throws IOException
  {
  if( statuses != null )
    return;

  statuses = getFileSystem( conf ).listStatus( getPath() );
  }
Hfs approxCountsTap = new Hfs(new SequenceFile(new Fields("bytes")), partsDir);
TupleEntryIterator in = approxCountsTap.openForRead(CascadingUtil.get().getFlowProcess());
List<HyperLogLog> countParts = new LinkedList<HyperLogLog>();
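// A hedged sketch of draining the iterator opened above, assuming stream-lib's
// HyperLogLog (com.clearspring.analytics.stream.cardinality.HyperLogLog), whose
// Builder.build(byte[]) deserializes the bytes written by the cleanup() snippet
// further below; assumes an enclosing method declaring IOException, plus java.util.Arrays.
try {
  while (in.hasNext()) {
    BytesWritable bytes = (BytesWritable) in.next().getObject("bytes");
    countParts.add(HyperLogLog.Builder.build(Arrays.copyOf(bytes.getBytes(), bytes.getLength())));
  }
} finally {
  in.close();
}

HyperLogLog merged = countParts.remove(0);
try {
  for (HyperLogLog part : countParts) {
    merged.addAll(part);
  }
} catch (CardinalityMergeException e) {
  throw new RuntimeException("couldn't merge HLL parts", e);
}
LOG.info("approximate distinct keys: " + merged.cardinality());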
@Test
public void testHfsAsterisk() throws Exception
  {
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  Hfs sourceExists = new Hfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "*" );

  assertTrue( sourceExists.resourceExists( getPlatform().getFlowProcess() ) );

  TupleEntryIterator iterator = sourceExists.openForRead( getPlatform().getFlowProcess() );
  assertTrue( iterator.hasNext() );
  iterator.close();

  try
    {
    Hfs sourceNotExists = new Hfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "/blah/" );
    iterator = sourceNotExists.openForRead( getPlatform().getFlowProcess() );
    fail();
    }
  catch( IOException exception )
    {
    // do nothing
    }
  }
public static String writeStateToDistCache( JobConf conf, String id, String kind, String stepState )
  {
  if( Util.isEmpty( stepState ) )
    return null;

  LOG.info( "writing step state to dist cache, too large for job conf, size: {}", stepState.length() );

  String statePath = Hfs.getTempPath( conf ) + "/" + kind + "-state-" + id;

  Hfs temp = new Hfs( new TextLine(), statePath, SinkMode.REPLACE );

  try
    {
    TupleEntryCollector writer = temp.openForWrite( new HadoopFlowProcess( conf ) );

    writer.add( new Tuple( stepState ) );

    writer.close();
    }
  catch( IOException exception )
    {
    throw new FlowException( "unable to write step state to Hadoop FS: " + temp.getIdentifier() );
    }

  URI uri = new Path( statePath ).toUri();
  DistributedCache.addCacheFile( uri, conf );

  LOG.info( "using step state path: {}", uri );

  return statePath;
  }
@Override
public void cleanup(FlowProcess flowProcess, OperationCall operationCall) {
  JobConf conf = (JobConf) flowProcess.getConfigCopy();
  try {
    LOG.info("HLL counter found " + approxCounter.cardinality() + " distinct keys");

    Hfs tap = new Hfs(new SequenceFile(new Fields("bytes")), BloomProps.getApproxCountsDir(conf));
    TupleEntryCollector out = tap.openForWrite(new HadoopFlowProcess(conf));
    out.add(new Tuple(new BytesWritable(approxCounter.getBytes())));
    out.close();
  } catch (IOException e) {
    throw new RuntimeException("couldn't write approximate counts to side bucket", e);
  }
}
@Override
public long getSize( Configuration conf ) throws IOException
  {
  if( !resourceExists( conf ) )
    return 0;

  FileStatus fileStatus = getFileStatus( conf );

  if( fileStatus.isDir() )
    return 0;

  return fileStatus.getLen(); // reuse the status fetched above rather than a second file-system round trip
  }
@Override
protected Path convert( Hfs next )
  {
  return next.getPath(); // we are building fully qualified paths above
  }
} );
public static boolean removeStateFromDistCache( Configuration conf, String path ) throws IOException
  {
  return new Hfs( new TextLine(), path ).deleteResource( conf );
  }
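// A minimal usage sketch (the id, kind, and stepState values are hypothetical):
// pair the writeStateToDistCache() helper above with this cleanup once the step completes.
JobConf conf = new JobConf();
String statePath = writeStateToDistCache( conf, "0001", "flow", stepState );

if( statePath != null )
  removeStateFromDistCache( conf, statePath ); // may throw IOException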
@Override
public boolean isDirectory( Configuration conf ) throws IOException
  {
  if( !resourceExists( conf ) )
    return false;

  return getFileSystem( conf ).getFileStatus( getPath() ).isDir();
  }
@Override
public TapWith<Configuration, RecordReader, OutputCollector> withChildIdentifier( String identifier )
  {
  Path path = new Path( identifier );

  if( !path.toString().startsWith( getPath().toString() ) )
    path = new Path( getPath(), path );

  return create( getScheme(), path, getSinkMode() );
  }
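// A hedged illustration (the tap "parent", rooted at /data/output, is assumed):
// relative identifiers are resolved under the parent path, while identifiers that
// already start with it pass through unchanged.
TapWith<Configuration, RecordReader, OutputCollector> child =
  parent.withChildIdentifier( "part-00000" );              // -> /data/output/part-00000
TapWith<Configuration, RecordReader, OutputCollector> same =
  parent.withChildIdentifier( "/data/output/part-00000" ); // already qualified, kept as-is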
protected URI makeURIScheme( Configuration configuration )
  {
  try
    {
    URI uriScheme;

    LOG.debug( "handling path: {}", stringPath );

    URI uri = new Path( stringPath ).toUri(); // safer URI parsing
    String schemeString = uri.getScheme();
    String authority = uri.getAuthority();

    LOG.debug( "found scheme: {}, authority: {}", schemeString, authority );

    if( schemeString != null && authority != null )
      uriScheme = new URI( schemeString + "://" + uri.getAuthority() );
    else if( schemeString != null )
      uriScheme = new URI( schemeString + ":///" );
    else
      uriScheme = getDefaultFileSystemURIScheme( configuration );

    LOG.debug( "using uri scheme: {}", uriScheme );

    return uriScheme;
    }
  catch( URISyntaxException exception )
    {
    throw new TapException( "could not determine scheme from path: " + getPath(), exception );
    }
  }
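// A hedged illustration of the three branches above (the paths are assumed examples):
URI a = new Path( "hdfs://namenode:8020/data/logs" ).toUri(); // scheme + authority -> hdfs://namenode:8020
URI b = new Path( "file:///tmp/data" ).toUri();               // scheme, no authority -> file:///
URI c = new Path( "/tmp/data" ).toUri();                      // neither -> default file-system scheme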
@Override
public boolean createResource( Configuration conf ) throws IOException
  {
  if( LOG.isDebugEnabled() )
    LOG.debug( "making dirs: {}", getFullIdentifier( conf ) );

  return getFileSystem( conf ).mkdirs( getPath() );
  }
@Override
public String[] getChildIdentifiers( Configuration conf, int depth, boolean fullyQualified ) throws IOException
  {
  if( !resourceExists( conf ) )
    return new String[ 0 ];

  if( depth == 0 && !fullyQualified )
    return new String[]{getIdentifier()};

  String fullIdentifier = getFullIdentifier( conf );

  int trim = fullyQualified ? 0 : fullIdentifier.length() + 1;

  Set<String> results = new LinkedHashSet<String>();

  getChildPaths( conf, results, trim, new Path( fullIdentifier ), depth );

  return results.toArray( new String[ results.size() ] );
  }
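// A minimal usage sketch (the tap and conf are assumed): list children one level
// deep; with fullyQualified false the parent prefix is trimmed from each identifier.
String[] children = tap.getChildIdentifiers( conf, 1, false );

for( String child : children )
  LOG.debug( "child: {}", child ); // e.g. "part-00000" rather than "hdfs://.../part-00000"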
/**
 * Method getBlockSize returns the {@code blocksize} specified by the underlying file system for this resource.
 *
 * @param conf of type Configuration
 * @return long
 * @throws IOException when the file status cannot be retrieved
 */
public long getBlockSize( Configuration conf ) throws IOException
  {
  if( !resourceExists( conf ) )
    return 0;

  FileStatus fileStatus = getFileStatus( conf );

  if( fileStatus.isDir() )
    return 0;

  return fileStatus.getBlockSize();
  }
// body of the matching read-back helper; only the three original statements are
// verbatim, the surrounding control flow is a reconstruction from their messages
TupleEntryIterator reader = null;

try
  {
  reader = temp.openForRead( new HadoopFlowProcess( jobConf ) );

  if( !reader.hasNext() )
    throw new FlowException( "step state path is empty: " + temp.getIdentifier() );

  return reader.next().getObject( 0 ).toString();
  }
catch( IOException exception )
  {
  throw new FlowException( "unable to find state path: " + temp.getIdentifier(), exception );
  }
finally
  {
  try
    {
    if( reader != null )
      reader.close();
    }
  catch( IOException exception )
    {
    // ignore close failures
    }
  }