/**
 * Releases the per-sink context after all tuples have been written.
 * <p>
 * Clearing the context lets the alias mapping installed during sink
 * preparation be garbage collected once the sink is finished.
 *
 * @param flowProcess of type FlowProcess
 * @param sinkCall    of type SinkCall
 * @throws IOException if the superclass cleanup fails
 */
@Override // added: sibling lifecycle overrides (sinkPrepare/sourceCleanup) carry @Override; this one was missing it
public void sinkCleanup(FlowProcess<JobConf> flowProcess, SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
    super.sinkCleanup(flowProcess, sinkCall);
    sinkCall.setContext(null);
}
@Override public void sinkPrepare(FlowProcess<JobConf> flowProcess, SinkCall<Object[], OutputCollector> sinkCall) throws IOException { super.sinkPrepare(flowProcess, sinkCall); Object[] context = new Object[SINK_CTX_SIZE]; // the tuple is fixed, so we can just use a collection/index Settings settings = loadSettings(flowProcess.getConfigCopy(), false); context[SINK_CTX_ALIASES] = CascadingUtils.fieldToAlias(settings, getSinkFields()); sinkCall.setContext(context); }
/**
 * Tears down the per-source context after reading completes.
 *
 * @param flowProcess of type FlowProcess
 * @param sourceCall  of type SourceCall
 * @throws IOException if the superclass cleanup fails
 */
@Override
public void sourceCleanup(FlowProcess<JobConf> flowProcess, SourceCall<Object[], RecordReader> sourceCall) throws IOException {
    super.sourceCleanup(flowProcess, sourceCall);

    // drop the context reference so it can be garbage collected
    sourceCall.setContext(null);
}
/**
 * Returns the number of sink parts declared by the wrapped scheme.
 *
 * @return the numSinkParts (type int)
 */
public int getNumSinkParts() {
    return scheme.getNumSinkParts();
}
/**
 * Compares this scheme against another for equality over the key field,
 * family names, and value fields, consistent with {@code hashCode}.
 *
 * @param object the object to compare with
 * @return true when both schemes are equivalent
 */
@Override
public boolean equals(Object object) {
    if (this == object) {
        return true;
    }

    // reject non-matching types up front, then defer to the superclass
    if (object == null || getClass() != object.getClass() || !super.equals(object)) {
        return false;
    }

    HBaseScheme that = (HBaseScheme) object;

    boolean sameKeyField = keyField == null ? that.keyField == null : keyField.equals(that.keyField);

    return sameKeyField
        && Arrays.equals(familyNames, that.familyNames)
        && Arrays.equals(valueFields, that.valueFields);
}
/**
 * Adopts the normalized source and/or sink fields of the given scheme,
 * verifying they agree with the fields already recorded for this stereotype.
 * <p>
 * The source and sink branches previously duplicated the same
 * record-or-verify logic; it is now shared via a private helper.
 *
 * @param scheme the scheme whose fields to record or verify
 * @throws IllegalArgumentException when the fields disagree with those already recorded
 */
private void setFields( Scheme scheme )
  {
  if( scheme.isSource() )
    applySchemeFields( normalize( scheme.getSourceFields() ), "source" );

  if( scheme.isSink() )
    applySchemeFields( normalize( scheme.getSinkFields() ), "sink" );
  }

/** Records the given fields, or verifies they match the fields seen so far for this stereotype. */
private void applySchemeFields( Fields received, String role )
  {
  if( fields == null )
    fields = received;
  else if( !fields.equals( received ) )
    throw new IllegalArgumentException( "all schemes added to stereotype must have the same " + role + " fields, expected: " + fields + ", received: " + received + " in stereotype: " + getName() );
  }
/**
 * Constructs a LocalScheme that wraps the given Hadoop scheme, adopting its
 * declared source and sink fields.
 *
 * @param scheme the Hadoop scheme to delegate to
 */
public LocalScheme(Scheme<JobConf, RecordReader, OutputCollector, SourceContext, SinkContext> scheme) {
    super(scheme.getSourceFields(), scheme.getSinkFields());
    this.scheme = scheme;
}
/**
 * Verifies the checkpoint taps declared on the given FlowDef.
 * <p>
 * Checkpoint taps must not double as sources or sinks, must use an undeclared
 * scheme (UNKNOWN source fields, ALL sink fields), and every named checkpoint
 * must correspond to exactly one {@link Checkpoint} pipe on a branch of the
 * given assembly.
 *
 * @param flowDef   the flow definition holding the checkpoints
 * @param flowTails the tail pipes of the assembly
 * @throws PlannerException when any checkpoint constraint is violated
 */
protected void verifyCheckpoints( FlowDef flowDef, Pipe[] flowTails )
  {
  verifyNotSourcesSinks( flowDef.getCheckpoints(), flowDef.getSources(), flowDef.getSinks(), "checkpoint" );

  for( Tap checkpointTap : flowDef.getCheckpoints().values() )
    {
    Scheme scheme = checkpointTap.getScheme();

    boolean isUndeclared = scheme.getSourceFields().equals( Fields.UNKNOWN ) && scheme.getSinkFields().equals( Fields.ALL );

    if( !isUndeclared )
      throw new PlannerException( "checkpoint tap scheme must be undeclared, source fields must be UNKNOWN, and sink fields ALL, got: " + scheme.toString() );
    }

  Set<String> branchNames = new HashSet<String>( asList( Pipe.names( flowTails ) ) );

  for( String checkpointName : flowDef.getCheckpoints().keySet() )
    {
    if( !branchNames.contains( checkpointName ) )
      throw new PlannerException( "named checkpoint declared in FlowDef, but no named branch found in pipe assembly: '" + checkpointName + "'" );

    // the named branch must carry exactly one Checkpoint pipe
    Set<Pipe> branchPipes = new HashSet<Pipe>( asList( Pipe.named( checkpointName, flowTails ) ) );

    int checkpointCount = 0;

    for( Pipe pipe : branchPipes )
      {
      if( pipe instanceof Checkpoint )
        checkpointCount++;
      }

    if( checkpointCount == 0 )
      throw new PlannerException( "no checkpoint pipe with branch name found in pipe assembly: '" + checkpointName + "'" );

    if( checkpointCount > 1 )
      throw new PlannerException( "more than one checkpoint pipe with branch name found in pipe assembly: '" + checkpointName + "'" );
    }
  }
// NOTE(review): this constructor appears truncated/mangled in this view — the
// opening brace is missing before the super(...) call, and the initialization
// of 'sourceCall' (used below), 'tap', 'inputIterator', and
// 'loggableIdentifier' is not visible. Confirm against the full source file
// before relying on this snippet; it does not compile as shown.
public TupleEntrySchemeIterator( FlowProcess<? extends Config> flowProcess, Tap tap, Scheme scheme, CloseableIterator<Input> inputIterator, Supplier<String> loggableIdentifier )
  // declared tuple fields come from the scheme's source fields
  super( scheme.getSourceFields() );
  this.flowProcess = flowProcess;
  this.scheme = scheme;
  // presumably sourceCall is created earlier in the full constructor body — TODO confirm
  this.scheme.sourcePrepare( flowProcess, sourceCall );
/**
 * Computes the hash code from the key field, family names, and value fields,
 * consistent with {@code equals}.
 *
 * @return the hash code (type int)
 */
@Override
public int hashCode() {
    int result = super.hashCode();

    // Arrays.hashCode(null) is specified to return 0, matching the previous
    // explicit null checks on the array-typed fields.
    result = 31 * result + (keyField == null ? 0 : keyField.hashCode());
    result = 31 * result + Arrays.hashCode(familyNames);
    result = 31 * result + Arrays.hashCode(valueFields);

    return result;
}
}
/**
 * Method retrieveSinkFields notifies a Scheme when it is appropriate to dynamically
 * update the fields it sinks. By default the current declared fields are returned.
 * <p>
 * The {@code FlowProcess} presents all known properties resolved by the current planner.
 * <p>
 * The {@code tap} instance is the parent {@link Tap} for this Scheme instance.
 *
 * @param flowProcess of type FlowProcess
 * @param tap         of type Tap
 * @return Fields
 */
public Fields retrieveSinkFields( FlowProcess<? extends Config> flowProcess, Tap tap )
  {
  return getSinkFields();
  }
/**
 * Adopts the given fields as the source fields, but only while the declared
 * source fields are still {@code Fields.UNKNOWN}.
 *
 * @param fields the resolved fields to adopt
 */
protected void presentSourceFieldsInternal( Fields fields )
  {
  boolean isUndeclared = getSourceFields().equals( Fields.UNKNOWN );

  if( isUndeclared )
    setSourceFields( fields );
  }
/**
 * Adopts the given fields as the sink fields, but only while the declared
 * sink fields are still {@code Fields.ALL}.
 *
 * @param fields the resolved fields to adopt
 */
protected void presentSinkFieldsInternal( Fields fields )
  {
  boolean isUndeclared = getSinkFields().equals( Fields.ALL );

  if( isUndeclared )
    setSinkFields( fields );
  }
/**
 * Hook allowing a Scheme to refresh the fields it sources once the planner
 * has resolved all known properties. The default implementation simply
 * returns the currently declared source fields.
 * <p>
 * The {@code tap} instance is the parent {@link Tap} for this Scheme instance.
 *
 * @param flowProcess of type FlowProcess
 * @param tap         of type Tap
 * @return Fields
 */
public Fields retrieveSourceFields( FlowProcess<? extends Config> flowProcess, Tap tap )
  {
  return getSourceFields();
  }
/**
 * Forwards sink field presentation to the wrapped Hadoop scheme, substituting
 * a Hadoop flow process (built from the default configuration) and the
 * backing file tap for the local ones.
 */
@Override
public void presentSinkFields(FlowProcess<Properties> flowProcess, Tap tap, Fields fields) {
    HadoopFlowProcess hadoopFlowProcess = new HadoopFlowProcess(defaults);

    scheme.presentSinkFields(hadoopFlowProcess, lfs, fields);
}
/**
 * Delegates the requested number of sink parts to the wrapped scheme.
 *
 * @param numSinkParts of type int
 */
public void setNumSinkParts( int numSinkParts )
  {
  scheme.setNumSinkParts( numSinkParts );
  }
protected int getParallelism( FlowNode flowNode, JobConf conf ) { // only count streamed taps, accumulated taps are always annotated HashSet<Tap> sourceStreamedTaps = new HashSet<>( flowNode.getSourceTaps() ); sourceStreamedTaps.removeAll( flowNode.getSourceElements( StreamMode.Accumulated ) ); if( sourceStreamedTaps.size() != 0 ) return -1; int parallelism = Integer.MAX_VALUE; for( Tap tap : flowNode.getSinkTaps() ) { int numSinkParts = tap.getScheme().getNumSinkParts(); if( numSinkParts == 0 ) continue; if( parallelism != Integer.MAX_VALUE ) LOG.info( "multiple sink taps in flow node declaring numSinkParts, choosing lowest value. see cascading.flow.FlowRuntimeProps for broader control." ); parallelism = Math.min( parallelism, numSinkParts ); } if( parallelism != Integer.MAX_VALUE ) return parallelism; return conf.getInt( FlowRuntimeProps.GATHER_PARTITIONS, 0 ); }
/**
 * Verifies every tap in this composite is of the same concrete type and uses
 * a scheme equivalent to the first tap's.
 *
 * @throws TapException when a tap of a different type or scheme is found
 */
private void verifyTaps()
  {
  Tap first = taps[ 0 ];

  for( int i = 1; i < taps.length; i++ )
    {
    Tap other = taps[ i ];

    if( first.getClass() != other.getClass() )
      throw new TapException( "all taps must be of the same type" );

    if( !first.getScheme().equals( other.getScheme() ) )
      throw new TapException( "all tap schemes must be equivalent" );
    }
  }
/**
 * Method isSymmetrical returns {@code true} if the sink fields equal the source fields. That is, this
 * scheme sources the same fields as it sinks.
 *
 * @return the symmetrical (type boolean) of this Scheme object.
 */
public boolean isSymmetrical()
  {
  Fields sourceFields = getSourceFields();
  Fields sinkFields = getSinkFields();

  // a fully undeclared scheme (UNKNOWN source, ALL sink) counts as symmetrical
  boolean isUndeclared = sourceFields.equals( Fields.UNKNOWN ) && sinkFields.equals( Fields.ALL );

  return isUndeclared || sinkFields.equals( sourceFields );
  }
/**
 * Computes the hash code over all configuration fields of this scheme,
 * consistent with {@code equals}.
 *
 * @return the hash code (type int)
 */
@Override
public int hashCode()
  {
  int result = super.hashCode();

  // Arrays.hashCode(null) is specified to return 0, matching the previous
  // explicit null checks on the array-typed fields.
  result = 31 * result + ( inputFormatClass == null ? 0 : inputFormatClass.hashCode() );
  result = 31 * result + ( outputFormatClass == null ? 0 : outputFormatClass.hashCode() );
  result = 31 * result + Arrays.hashCode( columns );
  result = 31 * result + Arrays.hashCode( orderBy );
  result = 31 * result + ( conditions == null ? 0 : conditions.hashCode() );
  result = 31 * result + Arrays.hashCode( updateBy );
  result = 31 * result + ( updateValueFields == null ? 0 : updateValueFields.hashCode() );
  result = 31 * result + ( updateByFields == null ? 0 : updateByFields.hashCode() );
  result = 31 * result + ( columnFields == null ? 0 : columnFields.hashCode() );
  result = 31 * result + ( updateIfTuple == null ? 0 : updateIfTuple.hashCode() );
  result = 31 * result + ( selectQuery == null ? 0 : selectQuery.hashCode() );
  result = 31 * result + ( countQuery == null ? 0 : countQuery.hashCode() );
  // fold both halves of the long into the int result
  result = 31 * result + (int) ( limit ^ ( limit >>> 32 ) );

  return result;
  }
}