/**
 * Builds per-row observation weights implementing a random train/test split.
 * weights[0]: weightMult[class] (or 1 when weightMult is null) for rows sampled into the
 *             training set, 0 for test rows.
 * weights[1]: the complementary test-set weights; only built when weightMult is given,
 *             otherwise left null.
 *
 * NOTE(review): the split is seeded from new Random().nextLong(), so results differ
 * between invocations by design — confirm this non-determinism is intended.
 */
public static Vec[] makeWeights(Vec responseVec, final double trainRatio, final double[] weightMult) {
  Vec[] weights = new Vec[2];
  weights[0] = responseVec.makeZero(); // numeric vec aligned row-for-row with responseVec; filled below
  final long seed = water.util.RandomUtils.getRNG(new Random().nextLong()).nextLong();
  new MRTask() {
    @Override public void map(Chunk[] c){
      long start = c[0].start(); // global row index of this chunk's first row
      Random rng = new water.util.RandomUtils.PCGRNG(start,1);
      int yval;
      for(int i = 0; i < c[0]._len; ++i) {
        yval=(int)c[0].at8(i); // response class label of this row
        rng.setSeed(seed+start+i); // deterministic per-row seed (given the task-level seed above)
        // train row -> class-dependent weight (or 1), test row -> 0
        c[1].set(i, rng.nextFloat() < trainRatio ? (weightMult==null?1:weightMult[yval]) : 0);
      }
    }
  }.doAll(responseVec,weights[0]);
  if( null!=weightMult ) {
    // Complement of weights[0]: rows with training weight 0 get their class multiplier,
    // training rows get 0.
    weights[1] = new MRTask() {
      @Override public void map(Chunk[] cs, NewChunk n) {
        for(int i=0;i<cs[0]._len;++i) {
          if (0 == cs[1].at8(i)) n.addNum(weightMult[(int) cs[0].at8(i)]);
          else n.addNum(0);
        }
      }
    }.doAll(Vec.T_NUM, new Frame(responseVec,weights[0])).outputFrame().anyVec();
  }
  return weights;
}
/**
 * Converts a string Vec into a numeric Frame with {@code _output._vecSize} columns
 * (one per embedding dimension), using the requested aggregation strategy.
 *
 * @param wordVec string vector to transform
 * @param aggregateMethod AVERAGE selects Word2VecAggregateTask, anything else Word2VecTransformTask
 * @return a freshly keyed Frame with default column names
 * @throws IllegalArgumentException if {@code wordVec} is not a string vector
 */
public Frame transform(Vec wordVec, AggregateMethod aggregateMethod) {
  if (wordVec.get_type() != Vec.T_STR) {
    throw new IllegalArgumentException("Expected a string vector, got " + wordVec.get_type_str() + " vector.");
  }
  // every output column is numeric
  final byte[] outputTypes = new byte[_output._vecSize];
  Arrays.fill(outputTypes, Vec.T_NUM);
  final MRTask<?> task;
  if (aggregateMethod == AggregateMethod.AVERAGE) {
    task = new Word2VecAggregateTask(this);
  } else {
    task = new Word2VecTransformTask(this);
  }
  return task.doAll(outputTypes, wordVec).outputFrame(Key.<Frame>make(), null, null);
}
/** Merges the partial result of another task into this one (distributed reduce). */
@Override public void reduce(GLMCoordinateDescentTaskSeqIntercept git){
  _temp+= git._temp; // accumulate the other task's partial sum
  super.reduce(git);
}
/**
 * Do the local computation: Perform one DeepLearningTask (with run_local=true) iteration.
 * Pass over all the data (will be replicated in dfork() here), and use _sync_fraction random rows.
 * This calls DeepLearningTask's reduce() between worker threads that update the same local model_info via Hogwild!
 * Once the computation is done, reduce() will be called
 */
@Override
public void setupLocal() {
  super.setupLocal();
  _res = new DeepLearningTask(_jobKey, _sharedmodel, _sync_fraction, _iteration, this);
  addToPendingCount(1); // keep this task pending until the forked child task completes
  _res.dfork(null, _fr, true /*run_local*/); // asynchronous fork with run_local=true (see javadoc above)
}
/** Simple testing scenario, splitting frame in the middle and comparing the values */ static void testScenario(Frame f, String[] expValues, MRTask chunkAssertions) { double[] ratios = ard(0.5, 0.5); Key<Frame>[] keys = aro(Key.<Frame>make("test.hex"), Key.<Frame>make("train.hex")); Frame[] splits = null; try { splits = ShuffleSplitFrame.shuffleSplitFrame(f, keys, ratios, 42); Assert.assertEquals("Expecting 2 splits", 2, splits.length); // Collect values from both splits String[] values = append( collectS(splits[0].vec(0)), collectS(splits[1].vec(0))); // Sort values, but first replace all nulls by unique value Arrays.sort(replaceNulls(expValues)); Arrays.sort(replaceNulls(values)); Assert.assertArrayEquals("Values should match", expValues, values); if (chunkAssertions != null) { for (Frame s: splits) chunkAssertions.doAll(s).getResult(); } } finally { f.delete(); if (splits!=null) for(Frame s: splits) s.delete(); } }
/**
 * Finish up the work after all nodes have reduced their models via the above reduce() method.
 * All we do is average the models and add to the global training sample counter.
 * After this returns, model_info() can be queried for the updated model.
 */
@Override
protected void postGlobal() {
  // only valid when every node trained on (a replica of) the full data set
  assert(_res.model_info().get_params()._replicate_training_data);
  super.postGlobal();
  // model averaging (DeepWaterTask only computed the per-node models, each on all the data)
  // NOTE(review): the division below is intentionally disabled — confirm whether per-node
  // averaging is handled elsewhere for DeepWater:
  // _res.model_info().div(_res._chunk_node_count);
  _res.model_info().add_processed_global(_res.model_info().get_processed_local()); //switch from local counters to global counters
  _res.model_info().set_processed_local(0L);
  _sharedmodel = _res.model_info(); // expose the merged model to callers of model_info()
}
}
for( int i=0; i<ratios.length; i++ ) { Vec[] nvecs = new Vec[ncols]; final int rowLayout = mr.appendables()[i*ncols].compute_rowLayout(); for( int c=0; c<ncols; c++ ) { AppendableVec av = mr.appendables()[i*ncols + c]; av.setDomain(vecs[c].domain()); nvecs[c] = av.close(rowLayout,fs);
/**
 * Do the local computation: Perform one DeepWaterTask (with run_local=true) iteration.
 * Pass over all the data (will be replicated in dfork() here), and use _sync_fraction random rows.
 * This calls DeepWaterTask's reduce() between worker threads that update the same local model_info via Hogwild!
 * Once the computation is done, reduce() will be called
 */
@Override
public void setupLocal() {
  super.setupLocal();
  _res = new DeepWaterTask(_sharedmodel, _sync_fraction, (Job)_jobKey.get());
  addToPendingCount(1); // keep this task pending until the forked child task completes
  _res.dfork(null, _fr, true /*run_local*/); // asynchronous fork with run_local=true (see javadoc above)
}
/** * Finish up the work after all nodes have reduced their models via the above reduce() method. * All we do is average the models and add to the global training sample counter. * After this returns, model_info() can be queried for the updated model. */ @Override protected void postGlobal() { assert(_res.model_info().get_params()._replicate_training_data); super.postGlobal(); // model averaging (DeepLearningTask only computed the per-node models, each on all the data) _res.model_info().div(_res._chunk_node_count); _res.model_info().add_processed_global(_res.model_info().get_processed_local()); //switch from local counters to global counters _res.model_info().set_processed_local(0l); DeepLearningModelInfo nodeAverageModel = _res.model_info(); if (nodeAverageModel.get_params()._elastic_averaging) _sharedmodel = DeepLearningModelInfo.timeAverage(nodeAverageModel); else _sharedmodel = nodeAverageModel; } }
for( int i=0; i<ratios.length; i++ ) { Vec[] nvecs = new Vec[ncols]; final int rowLayout = mr.appendables()[i*ncols].compute_rowLayout(); for( int c=0; c<ncols; c++ ) { AppendableVec av = mr.appendables()[i*ncols + c]; av.setDomain(vecs[c].domain()); nvecs[c] = av.close(rowLayout,fs);
public static Vec[] makeStratifiedWeights(Vec responseVec, final double trainRatio, final double[] weightMult) { Vec[] weights = new Vec[2]; long seed = getRNG(new Random().nextLong()).nextLong(); final int nClass = responseVec.domain().length; final long[] seeds = new long[nClass]; // seed for each regular fold column (one per class) for( int i=0;i<nClass;++i) seeds[i] = getRNG(seed + i).nextLong(); weights[0] = new MRTask() { private boolean isTest(int row, long seed) { return getRNG(row+seed).nextDouble() > trainRatio; } @Override public void map(Chunk y, NewChunk ss) { // 1-> train, 0-> test int start = (int)y.start(); int yval; for(int classLabel=0; classLabel<nClass; ++classLabel) for(int row=0;row<y._len;++row) if( (yval=(int)y.at8(row)) == classLabel ) ss.addNum( isTest(start+row,seeds[classLabel])?0:(weightMult==null?1:weightMult[yval])); } }.doAll(Vec.T_NUM, responseVec).outputFrame().anyVec(); if( null!=weightMult ) { new MRTask() { @Override public void map(Chunk[] cs) { for(int i=0; i<cs[0]._len; ++i) { if (0 == cs[1].at8(i)) cs[2].set(i,weightMult[(int) cs[0].at8(i)]); } } }.doAll(responseVec,weights[0],weights[1]=weights[0].makeZero()); } return weights; }
/**
 * Converts a string Vec into a numeric Frame with {@code _output._vecSize} columns
 * (one per embedding dimension), using the requested aggregation strategy.
 *
 * @param wordVec string vector to transform
 * @param aggregateMethod AVERAGE selects Word2VecAggregateTask, anything else Word2VecTransformTask
 * @return a freshly keyed Frame with default column names
 * @throws IllegalArgumentException if {@code wordVec} is not a string vector
 */
public Frame transform(Vec wordVec, AggregateMethod aggregateMethod) {
  if (wordVec.get_type() != Vec.T_STR) {
    throw new IllegalArgumentException("Expected a string vector, got " + wordVec.get_type_str() + " vector.");
  }
  byte[] types = new byte[_output._vecSize]; // every output column is numeric
  Arrays.fill(types, Vec.T_NUM);
  MRTask<?> transformTask = aggregateMethod == AggregateMethod.AVERAGE ? new Word2VecAggregateTask(this) : new Word2VecTransformTask(this);
  return transformTask.doAll(types, wordVec).outputFrame(Key.<Frame>make(), null, null);
}
/** Merges the partial results of another task into this one (distributed reduce). */
@Override public void reduce(GLMCoordinateDescentTaskSeqNaive git){
  ArrayUtils.add(_temp, git._temp); // element-wise accumulation of the partial sums
  _nobs += git._nobs;               // total observation count across both tasks
  super.reduce(git);
}
}
/** Records on this node whether the XGBoost core extension is enabled. */
@Override
protected void setupLocal() {
  super.setupLocal();
  final ExtensionManager extensionManager = ExtensionManager.getInstance();
  enabled = extensionManager.isCoreExtensionEnabled(XGBoostExtension.NAME);
}
/**
 * Finish up the work after all nodes have reduced their models via the above reduce() method.
 * All we do is average the models and add to the global training sample counter.
 * After this returns, model_info() can be queried for the updated model.
 */
@Override
protected void postGlobal() {
  // only valid when every node trained on (a replica of) the full data set
  assert(_res.model_info().get_params()._replicate_training_data);
  super.postGlobal();
  // model averaging (DeepWaterTask only computed the per-node models, each on all the data)
  // NOTE(review): the division below is intentionally disabled — confirm whether per-node
  // averaging is handled elsewhere for DeepWater:
  // _res.model_info().div(_res._chunk_node_count);
  _res.model_info().add_processed_global(_res.model_info().get_processed_local()); //switch from local counters to global counters
  _res.model_info().set_processed_local(0L);
  _sharedmodel = _res.model_info(); // expose the merged model to callers of model_info()
}
}
/**
 * Builds and publishes a single-column Frame ("exemplar_assignment") that maps each row's
 * exemplar global id (gid) to that exemplar's position in the sorted gid array.
 *
 * @param destinationKey DKV key under which the resulting mapping Frame is stored
 * @return the mapping Frame (also put into the DKV)
 */
public Frame createMappingOfExemplars(Key destinationKey){
  // collect all exemplar gids, then sort them
  // NOTE(review): sorting suggests ArrayUtils.find does a binary search — confirm;
  // also confirm that every gid in the assignment vec is guaranteed to be present in keep
  // (find would otherwise yield a not-found sentinel).
  final long[] keep = MemoryManager.malloc8(_exemplars.length);
  for (int i=0;i<keep.length;++i) keep[i]=_exemplars[i].gid;
  Vec exAssignment = _exemplar_assignment_vec_key.get();
  Arrays.sort(keep);
  // translate each row's gid into its index within the sorted keep array
  Vec exemplarAssignment = new MRTask() {
    @Override public void map(Chunk c1, NewChunk nc) {
      for (int i = 0; i < c1._len; i++) {
        long gid = c1.at8(i);
        nc.addNum(ArrayUtils.find(keep, gid));
      }
    }
  }.doAll(Vec.T_NUM,exAssignment).outputFrame().vec(0);
  Frame mapping = new Frame(destinationKey,new String[]{"exemplar_assignment"}, new Vec[]{exemplarAssignment});
  // sanity checks: the mapping must cover exactly the exemplar set and all input rows
  final long[] uniqueExemplars = new VecUtils.CollectIntegerDomain().doAll(mapping.vecs()).domain();
  assert(uniqueExemplars.length==_exemplars.length);
  assert(mapping.numRows()==exAssignment.length());
  // NOTE(review): indices produced by find() should lie in [0, _exemplars.length) —
  // a strict `<` here would be the tighter assertion; confirm before changing.
  for(long exmp: uniqueExemplars){ assert(exmp <= _exemplars.length); }
  DKV.put(mapping);
  return mapping;
}
/** Merges partial per-chunk accumulators from another task (distributed reduce). */
@Override public void reduce(GLMGenerateWeightsTask git){ // adding contribution of all the chunks
  ArrayUtils.add(denums, git.denums); // element-wise sum of the denominator accumulators
  wsum+=git.wsum;                     // total weight sum
  wsumu += git.wsumu;
  _likelihood += git._likelihood;     // total likelihood contribution
  super.reduce(git);
}
/**
 * Do the local computation: Perform one DeepLearningTask (with run_local=true) iteration.
 * Pass over all the data (will be replicated in dfork() here), and use _sync_fraction random rows.
 * This calls DeepLearningTask's reduce() between worker threads that update the same local model_info via Hogwild!
 * Once the computation is done, reduce() will be called
 */
@Override
public void setupLocal() {
  super.setupLocal();
  _res = new DeepLearningTask(_jobKey, _sharedmodel, _sync_fraction, _iteration, this);
  addToPendingCount(1); // keep this task pending until the forked child task completes
  _res.dfork(null, _fr, true /*run_local*/); // asynchronous fork with run_local=true (see javadoc above)
}
/** * Finish up the work after all nodes have reduced their models via the above reduce() method. * All we do is average the models and add to the global training sample counter. * After this returns, model_info() can be queried for the updated model. */ @Override protected void postGlobal() { assert(_res.model_info().get_params()._replicate_training_data); super.postGlobal(); // model averaging (DeepLearningTask only computed the per-node models, each on all the data) _res.model_info().div(_res._chunk_node_count); _res.model_info().add_processed_global(_res.model_info().get_processed_local()); //switch from local counters to global counters _res.model_info().set_processed_local(0l); DeepLearningModelInfo nodeAverageModel = _res.model_info(); if (nodeAverageModel.get_params()._elastic_averaging) _sharedmodel = DeepLearningModelInfo.timeAverage(nodeAverageModel); else _sharedmodel = nodeAverageModel; } }
}.doAll(noutputs,Vec.T_NUM,dinfo._adaptedFrame).outputFrame(Key.make(),names,null); } else { byte[] types = new byte[dinfo.fullN()]; }.doAll(types, dinfo._adaptedFrame.vecs()).outputFrame(Key.make("OneHot"+Key.make().toString()), dinfo.coefNames(), null);