/**
 * Builds per-row observation weights implementing a random train/test split.
 * weights[0]: weightMult[class] (or 1 when weightMult is null) for rows sampled into the
 *             training set, 0 for test rows.
 * weights[1]: the complementary test-set weights; only built when weightMult is given,
 *             otherwise left null.
 *
 * NOTE(review): the split is seeded from new Random().nextLong(), so results differ
 * between invocations by design — confirm this non-determinism is intended.
 */
public static Vec[] makeWeights(Vec responseVec, final double trainRatio, final double[] weightMult) {
  Vec[] weights = new Vec[2];
  weights[0] = responseVec.makeZero(); // numeric vec aligned row-for-row with responseVec; filled below
  final long seed = water.util.RandomUtils.getRNG(new Random().nextLong()).nextLong();
  new MRTask() {
    @Override public void map(Chunk[] c){
      long start = c[0].start(); // global row index of this chunk's first row
      Random rng = new water.util.RandomUtils.PCGRNG(start,1);
      int yval;
      for(int i = 0; i < c[0]._len; ++i) {
        yval=(int)c[0].at8(i); // response class label of this row
        rng.setSeed(seed+start+i); // deterministic per-row seed (given the task-level seed above)
        // train row -> class-dependent weight (or 1), test row -> 0
        c[1].set(i, rng.nextFloat() < trainRatio ? (weightMult==null?1:weightMult[yval]) : 0);
      }
    }
  }.doAll(responseVec,weights[0]);
  if( null!=weightMult ) {
    // Complement of weights[0]: rows with training weight 0 get their class multiplier,
    // training rows get 0.
    weights[1] = new MRTask() {
      @Override public void map(Chunk[] cs, NewChunk n) {
        for(int i=0;i<cs[0]._len;++i) {
          if (0 == cs[1].at8(i)) n.addNum(weightMult[(int) cs[0].at8(i)]);
          else n.addNum(0);
        }
      }
    }.doAll(Vec.T_NUM, new Frame(responseVec,weights[0])).outputFrame().anyVec();
  }
  return weights;
}
/**
 * Converts a string Vec into a numeric Frame with {@code _output._vecSize} columns
 * (one per embedding dimension), using the requested aggregation strategy.
 *
 * @param wordVec string vector to transform
 * @param aggregateMethod AVERAGE selects Word2VecAggregateTask, anything else Word2VecTransformTask
 * @return a freshly keyed Frame with default column names
 * @throws IllegalArgumentException if {@code wordVec} is not a string vector
 */
public Frame transform(Vec wordVec, AggregateMethod aggregateMethod) {
  if (wordVec.get_type() != Vec.T_STR) {
    throw new IllegalArgumentException("Expected a string vector, got " + wordVec.get_type_str() + " vector.");
  }
  // every output column is numeric
  final byte[] outputTypes = new byte[_output._vecSize];
  Arrays.fill(outputTypes, Vec.T_NUM);
  final MRTask<?> task;
  if (aggregateMethod == AggregateMethod.AVERAGE) {
    task = new Word2VecAggregateTask(this);
  } else {
    task = new Word2VecTransformTask(this);
  }
  return task.doAll(outputTypes, wordVec).outputFrame(Key.<Frame>make(), null, null);
}
/** Merges the partial result of another task into this one (distributed reduce). */
@Override public void reduce(GLMCoordinateDescentTaskSeqIntercept git){
  _temp+= git._temp; // accumulate the other task's partial sum
  super.reduce(git);
}
/**
 * Do the local computation: Perform one DeepLearningTask (with run_local=true) iteration.
 * Pass over all the data (will be replicated in dfork() here), and use _sync_fraction random rows.
 * This calls DeepLearningTask's reduce() between worker threads that update the same local model_info via Hogwild!
 * Once the computation is done, reduce() will be called
 */
@Override
public void setupLocal() {
  super.setupLocal();
  _res = new DeepLearningTask(_jobKey, _sharedmodel, _sync_fraction, _iteration, this);
  addToPendingCount(1); // keep this task pending until the forked child task completes
  _res.dfork(null, _fr, true /*run_local*/); // asynchronous fork with run_local=true (see javadoc above)
}
/** Simple testing scenario, splitting frame in the middle and comparing the values */ static void testScenario(Frame f, String[] expValues, MRTask chunkAssertions) { double[] ratios = ard(0.5, 0.5); Key<Frame>[] keys = aro(Key.<Frame>make("test.hex"), Key.<Frame>make("train.hex")); Frame[] splits = null; try { splits = ShuffleSplitFrame.shuffleSplitFrame(f, keys, ratios, 42); Assert.assertEquals("Expecting 2 splits", 2, splits.length); // Collect values from both splits String[] values = append( collectS(splits[0].vec(0)), collectS(splits[1].vec(0))); // Sort values, but first replace all nulls by unique value Arrays.sort(replaceNulls(expValues)); Arrays.sort(replaceNulls(values)); Assert.assertArrayEquals("Values should match", expValues, values); if (chunkAssertions != null) { for (Frame s: splits) chunkAssertions.doAll(s).getResult(); } } finally { f.delete(); if (splits!=null) for(Frame s: splits) s.delete(); } }
/**
 * Finish up the work after all nodes have reduced their models via the above reduce() method.
 * All we do is average the models and add to the global training sample counter.
 * After this returns, model_info() can be queried for the updated model.
 */
@Override
protected void postGlobal() {
  // only valid when every node trained on (a replica of) the full data set
  assert(_res.model_info().get_params()._replicate_training_data);
  super.postGlobal();
  // model averaging (DeepWaterTask only computed the per-node models, each on all the data)
  // NOTE(review): the division below is intentionally disabled — confirm whether per-node
  // averaging is handled elsewhere for DeepWater:
  // _res.model_info().div(_res._chunk_node_count);
  _res.model_info().add_processed_global(_res.model_info().get_processed_local()); //switch from local counters to global counters
  _res.model_info().set_processed_local(0L);
  _sharedmodel = _res.model_info(); // expose the merged model to callers of model_info()
}
}
for( int i=0; i<ratios.length; i++ ) { Vec[] nvecs = new Vec[ncols]; final int rowLayout = mr.appendables()[i*ncols].compute_rowLayout(); for( int c=0; c<ncols; c++ ) { AppendableVec av = mr.appendables()[i*ncols + c]; av.setDomain(vecs[c].domain()); nvecs[c] = av.close(rowLayout,fs);
/**
 * Do the local computation: Perform one DeepWaterTask (with run_local=true) iteration.
 * Pass over all the data (will be replicated in dfork() here), and use _sync_fraction random rows.
 * This calls DeepWaterTask's reduce() between worker threads that update the same local model_info via Hogwild!
 * Once the computation is done, reduce() will be called
 */
@Override
public void setupLocal() {
  super.setupLocal();
  _res = new DeepWaterTask(_sharedmodel, _sync_fraction, (Job)_jobKey.get());
  addToPendingCount(1); // keep this task pending until the forked child task completes
  _res.dfork(null, _fr, true /*run_local*/); // asynchronous fork with run_local=true (see javadoc above)
}
/** * Finish up the work after all nodes have reduced their models via the above reduce() method. * All we do is average the models and add to the global training sample counter. * After this returns, model_info() can be queried for the updated model. */ @Override protected void postGlobal() { assert(_res.model_info().get_params()._replicate_training_data); super.postGlobal(); // model averaging (DeepLearningTask only computed the per-node models, each on all the data) _res.model_info().div(_res._chunk_node_count); _res.model_info().add_processed_global(_res.model_info().get_processed_local()); //switch from local counters to global counters _res.model_info().set_processed_local(0l); DeepLearningModelInfo nodeAverageModel = _res.model_info(); if (nodeAverageModel.get_params()._elastic_averaging) _sharedmodel = DeepLearningModelInfo.timeAverage(nodeAverageModel); else _sharedmodel = nodeAverageModel; } }
for( int i=0; i<ratios.length; i++ ) { Vec[] nvecs = new Vec[ncols]; final int rowLayout = mr.appendables()[i*ncols].compute_rowLayout(); for( int c=0; c<ncols; c++ ) { AppendableVec av = mr.appendables()[i*ncols + c]; av.setDomain(vecs[c].domain()); nvecs[c] = av.close(rowLayout,fs);
public static Vec[] makeStratifiedWeights(Vec responseVec, final double trainRatio, final double[] weightMult) { Vec[] weights = new Vec[2]; long seed = getRNG(new Random().nextLong()).nextLong(); final int nClass = responseVec.domain().length; final long[] seeds = new long[nClass]; // seed for each regular fold column (one per class) for( int i=0;i<nClass;++i) seeds[i] = getRNG(seed + i).nextLong(); weights[0] = new MRTask() { private boolean isTest(int row, long seed) { return getRNG(row+seed).nextDouble() > trainRatio; } @Override public void map(Chunk y, NewChunk ss) { // 1-> train, 0-> test int start = (int)y.start(); int yval; for(int classLabel=0; classLabel<nClass; ++classLabel) for(int row=0;row<y._len;++row) if( (yval=(int)y.at8(row)) == classLabel ) ss.addNum( isTest(start+row,seeds[classLabel])?0:(weightMult==null?1:weightMult[yval])); } }.doAll(Vec.T_NUM, responseVec).outputFrame().anyVec(); if( null!=weightMult ) { new MRTask() { @Override public void map(Chunk[] cs) { for(int i=0; i<cs[0]._len; ++i) { if (0 == cs[1].at8(i)) cs[2].set(i,weightMult[(int) cs[0].at8(i)]); } } }.doAll(responseVec,weights[0],weights[1]=weights[0].makeZero()); } return weights; }
/**
 * Converts a string Vec into a numeric Frame with {@code _output._vecSize} columns
 * (one per embedding dimension), using the requested aggregation strategy.
 *
 * @param wordVec string vector to transform
 * @param aggregateMethod AVERAGE selects Word2VecAggregateTask, anything else Word2VecTransformTask
 * @return a freshly keyed Frame with default column names
 * @throws IllegalArgumentException if {@code wordVec} is not a string vector
 */
public Frame transform(Vec wordVec, AggregateMethod aggregateMethod) {
  if (wordVec.get_type() != Vec.T_STR) {
    throw new IllegalArgumentException("Expected a string vector, got " + wordVec.get_type_str() + " vector.");
  }
  byte[] types = new byte[_output._vecSize]; // every output column is numeric
  Arrays.fill(types, Vec.T_NUM);
  MRTask<?> transformTask = aggregateMethod == AggregateMethod.AVERAGE ? new Word2VecAggregateTask(this) : new Word2VecTransformTask(this);
  return transformTask.doAll(types, wordVec).outputFrame(Key.<Frame>make(), null, null);
}
/** Merges the partial results of another task into this one (distributed reduce). */
@Override public void reduce(GLMCoordinateDescentTaskSeqNaive git){
  ArrayUtils.add(_temp, git._temp); // element-wise accumulation of the partial sums
  _nobs += git._nobs;               // total observation count across both tasks
  super.reduce(git);
}
}
/** Records on this node whether the XGBoost core extension is enabled. */
@Override
protected void setupLocal() {
  super.setupLocal();
  final ExtensionManager extensionManager = ExtensionManager.getInstance();
  enabled = extensionManager.isCoreExtensionEnabled(XGBoostExtension.NAME);
}
/**
 * Finish up the work after all nodes have reduced their models via the above reduce() method.
 * All we do is average the models and add to the global training sample counter.
 * After this returns, model_info() can be queried for the updated model.
 */
@Override
protected void postGlobal() {
  // only valid when every node trained on (a replica of) the full data set
  assert(_res.model_info().get_params()._replicate_training_data);
  super.postGlobal();
  // model averaging (DeepWaterTask only computed the per-node models, each on all the data)
  // NOTE(review): the division below is intentionally disabled — confirm whether per-node
  // averaging is handled elsewhere for DeepWater:
  // _res.model_info().div(_res._chunk_node_count);
  _res.model_info().add_processed_global(_res.model_info().get_processed_local()); //switch from local counters to global counters
  _res.model_info().set_processed_local(0L);
  _sharedmodel = _res.model_info(); // expose the merged model to callers of model_info()
}
}
/**
 * Builds and publishes a single-column Frame ("exemplar_assignment") that maps each row's
 * exemplar global id (gid) to that exemplar's position in the sorted gid array.
 *
 * @param destinationKey DKV key under which the resulting mapping Frame is stored
 * @return the mapping Frame (also put into the DKV)
 */
public Frame createMappingOfExemplars(Key destinationKey){
  // collect all exemplar gids, then sort them
  // NOTE(review): sorting suggests ArrayUtils.find does a binary search — confirm;
  // also confirm that every gid in the assignment vec is guaranteed to be present in keep
  // (find would otherwise yield a not-found sentinel).
  final long[] keep = MemoryManager.malloc8(_exemplars.length);
  for (int i=0;i<keep.length;++i) keep[i]=_exemplars[i].gid;
  Vec exAssignment = _exemplar_assignment_vec_key.get();
  Arrays.sort(keep);
  // translate each row's gid into its index within the sorted keep array
  Vec exemplarAssignment = new MRTask() {
    @Override public void map(Chunk c1, NewChunk nc) {
      for (int i = 0; i < c1._len; i++) {
        long gid = c1.at8(i);
        nc.addNum(ArrayUtils.find(keep, gid));
      }
    }
  }.doAll(Vec.T_NUM,exAssignment).outputFrame().vec(0);
  Frame mapping = new Frame(destinationKey,new String[]{"exemplar_assignment"}, new Vec[]{exemplarAssignment});
  // sanity checks: the mapping must cover exactly the exemplar set and all input rows
  final long[] uniqueExemplars = new VecUtils.CollectIntegerDomain().doAll(mapping.vecs()).domain();
  assert(uniqueExemplars.length==_exemplars.length);
  assert(mapping.numRows()==exAssignment.length());
  // NOTE(review): indices produced by find() should lie in [0, _exemplars.length) —
  // a strict `<` here would be the tighter assertion; confirm before changing.
  for(long exmp: uniqueExemplars){ assert(exmp <= _exemplars.length); }
  DKV.put(mapping);
  return mapping;
}
/** Merges partial per-chunk accumulators from another task (distributed reduce). */
@Override public void reduce(GLMGenerateWeightsTask git){ // adding contribution of all the chunks
  ArrayUtils.add(denums, git.denums); // element-wise sum of the denominator accumulators
  wsum+=git.wsum;                     // total weight sum
  wsumu += git.wsumu;
  _likelihood += git._likelihood;     // total likelihood contribution
  super.reduce(git);
}
/**
 * Do the local computation: Perform one DeepLearningTask (with run_local=true) iteration.
 * Pass over all the data (will be replicated in dfork() here), and use _sync_fraction random rows.
 * This calls DeepLearningTask's reduce() between worker threads that update the same local model_info via Hogwild!
 * Once the computation is done, reduce() will be called
 */
@Override
public void setupLocal() {
  super.setupLocal();
  _res = new DeepLearningTask(_jobKey, _sharedmodel, _sync_fraction, _iteration, this);
  addToPendingCount(1); // keep this task pending until the forked child task completes
  _res.dfork(null, _fr, true /*run_local*/); // asynchronous fork with run_local=true (see javadoc above)
}
/** * Finish up the work after all nodes have reduced their models via the above reduce() method. * All we do is average the models and add to the global training sample counter. * After this returns, model_info() can be queried for the updated model. */ @Override protected void postGlobal() { assert(_res.model_info().get_params()._replicate_training_data); super.postGlobal(); // model averaging (DeepLearningTask only computed the per-node models, each on all the data) _res.model_info().div(_res._chunk_node_count); _res.model_info().add_processed_global(_res.model_info().get_processed_local()); //switch from local counters to global counters _res.model_info().set_processed_local(0l); DeepLearningModelInfo nodeAverageModel = _res.model_info(); if (nodeAverageModel.get_params()._elastic_averaging) _sharedmodel = DeepLearningModelInfo.timeAverage(nodeAverageModel); else _sharedmodel = nodeAverageModel; } }
}.doAll(noutputs,Vec.T_NUM,dinfo._adaptedFrame).outputFrame(Key.make(),names,null); } else { byte[] types = new byte[dinfo.fullN()]; }.doAll(types, dinfo._adaptedFrame.vecs()).outputFrame(Key.make("OneHot"+Key.make().toString()), dinfo.coefNames(), null);