/**
 * A minor shortcut for applying a bitmask to a matrix
 *
 * @param arr  the array to apply the mask to
 * @param mask the mask to apply
 * @return the array with the mask applied
 */
public static INDArray applyMask(INDArray arr, INDArray mask) {
    return arr.mul(mask);
}
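// Usage sketch for applyMask (hypothetical example method and values; Nd4j.create
// and mul are core ND4J calls). Element-wise multiplication by a 0/1 mask zeroes
// out the masked-off entries:
static INDArray applyMaskExample() {
    INDArray arr = Nd4j.create(new double[] {1.0, 2.0, 3.0, 4.0});
    INDArray mask = Nd4j.create(new double[] {1.0, 0.0, 1.0, 0.0});
    return applyMask(arr, mask); // [1.0, 0.0, 3.0, 0.0]
}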
private INDArray scoreArray(INDArray labels, INDArray preOutput, IActivation activationFn, INDArray mask) {
    INDArray output = activationFn.getActivation(preOutput.dup(), true);
    INDArray yMinusyHat = Transforms.abs(labels.sub(output));
    INDArray scoreArr = yMinusyHat.mul(yMinusyHat); //squared error: (y - yHat)^2
    scoreArr = scoreArr.mul(trainMask); //apply the training mask, always
    if (mask != null) {
        scoreArr.muliColumnVector(mask);
    }
    return scoreArr;
}
@Override
public void applyUpdater(INDArray gradient, int iteration, int epoch) {
    if (lastGradient == null)
        throw new IllegalStateException("Updater has not been initialized with view state");

    double learningRate = config.getLearningRate(iteration, epoch);
    double rmsDecay = config.getRmsDecay();
    double epsilon = config.getEpsilon();

    //cache = decay * cache + (1 - decay) * gradient^2
    lastGradient.muli(rmsDecay).addi(gradient.mul(gradient).muli(1 - rmsDecay));

    //update = lr * gradient / (sqrt(cache) + epsilon)
    gradient.muli(learningRate).divi(Transforms.sqrt(lastGradient.dup(gradientReshapeOrder), false).addi(epsilon));
}
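// Worked scalar example of the RMSProp rule above (plain Java, no ND4J;
// hypothetical method name, illustrative numbers):
static double rmsPropStepExample() {
    double cache = 0.0, decay = 0.95, lr = 0.001, eps = 1e-8;
    double g = 0.5;
    cache = decay * cache + (1 - decay) * g * g;   // cache = 0.05 * 0.25 = 0.0125
    return lr * g / (Math.sqrt(cache) + eps);      // 0.0005 / ~0.1118 ≈ 0.00447
}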
/**
 * This method calculates 'phi', the probability density function
 * (see Bishop 23).
 *
 * @param diffSquared The squared 'x-mu' term of the Gaussian distribution (squared distance between 'x'
 *                    and the mean value of the distribution).
 * @param sigma       The standard deviation of the Gaussian distribution.
 * @return An array of shape [nsamples, nlabels, ndistributions] containing the probability density (phi)
 *         for each of the samples * labels * distributions for the given x, sigma, mu.
 */
private INDArray phi(INDArray diffSquared, INDArray sigma) {
    // phi = 1/(sqrt(2*PI)*s)^labelWidth * e^(-(x-mu)^2 / (2*s^2))
    INDArray minustwovariance = sigma.mul(sigma).muli(2).negi();

    // This is phi_i(x, mu, sigma)
    INDArray likelihoods = Transforms.exp(diffSquared.divi(minustwovariance))
                    .divi(Transforms.pow(sigma.mul(SQRT_TWO_PI), (double) mLabelWidth));
    return likelihoods;
}
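// Worked scalar check of the density above for a single label (labelWidth = 1),
// in plain Java with illustrative values x = 1.0, mu = 0.0, sigma = 2.0:
static double phiScalarExample() {
    double x = 1.0, mu = 0.0, sigma = 2.0;
    double diffSquared = (x - mu) * (x - mu);          // 1.0
    double minusTwoVariance = -2.0 * sigma * sigma;    // -8.0
    return Math.exp(diffSquared / minusTwoVariance)
                    / (sigma * Math.sqrt(2 * Math.PI)); // ≈ 0.1760, the N(1; 0, 2^2) density
}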
static INDArray randInt(int[] shape, int upper) {
    return Transforms.floor(Nd4j.rand(shape).mul(upper));
}
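// Usage sketch (hypothetical shape and bound). Nd4j.rand samples uniformly
// from [0, 1), so flooring rand * upper yields integers in [0, upper - 1]:
static INDArray randIntExample() {
    return randInt(new int[] {2, 3}, 10); // 2x3 array of values in {0, ..., 9}
}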
@Override
public INDArray computeGradient(INDArray labels, INDArray preOutput, IActivation activationFn, INDArray mask) {
    INDArray output = activationFn.getActivation(preOutput.dup(), true);
    INDArray yMinusyHat = labels.sub(output);
    INDArray dldyhat = yMinusyHat.mul(-2); //d((y - yHat)^2)/dyHat = -2 * (y - yHat)
    INDArray gradients = activationFn.backprop(preOutput.dup(), dldyhat).getFirst();
    gradients = gradients.mul(trainMask); //multiply with masks, always
    if (mask != null) {
        gradients.muliColumnVector(mask);
    }
    return gradients;
}
/**
 * Computes the absolute step size for numerical differentiation at the point x,
 * from a relative step size.
 *
 * @param relStep the relative step size; may be null, in which case a default of sqrt(eps) relative to x is used
 * @param x       the point at which the step is computed
 * @return the absolute step: sign(x) * relStep * max(|x|, 1.0)
 */
public static INDArray computeAbsoluteStep(INDArray relStep, INDArray x) {
    if (relStep == null) {
        relStep = pow(Nd4j.scalar(getEpsRelativeTo(x)), 0.5);
    }
    INDArray signX0 = x.gte(0).muli(2).subi(1); //maps x >= 0 to +1, x < 0 to -1
    return signX0.mul(relStep).muli(max(abs(x), 1.0));
}
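// Scalar sketch of the same rule, step = sign(x) * relStep * max(|x|, 1)
// (plain Java; hypothetical method name, illustrative values):
static double absoluteStepScalarExample() {
    double x = -3.0, relStep = 1e-4;
    double sign = x >= 0 ? 1.0 : -1.0;
    return sign * relStep * Math.max(Math.abs(x), 1.0); // -3.0e-4
}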
private void validateData(INDArray label, INDArray labelMask) {
    if (label.rank() != 3) {
        throw new IllegalArgumentException(
                        "UnderSamplingByMaskingPreProcessor can only be applied to a time series dataset");
    }
    if (label.size(1) > 2) {
        throw new IllegalArgumentException(
                        "UnderSamplingByMaskingPreProcessor can only be applied to labels that represent binary classes. Label size was found to be "
                                        + label.size(1) + ". Expecting size=1 or size=2.");
    }
    if (label.size(1) == 2) {
        //check that the labels are one-hot: for one-hot labels, the sum over dimension 1 equals 1 wherever the mask is set
        if (!label.sum(1).mul(labelMask).equals(labelMask)) {
            throw new IllegalArgumentException("Labels of size minibatch x 2 x timesteps are expected to be one hot."
                            + label.toString() + "\n is not one-hot");
        }
    }
}
/**
 * Gets feature-specific learning rates.
 * Adagrad keeps a history of gradients being passed in.
 * Note that each gradient passed in becomes adapted over time, hence
 * the name adagrad ("adaptive gradient").
 *
 * @param gradient  the gradient to get learning rates for
 * @param iteration the current iteration
 * @return the gradient scaled by the feature-specific learning rates
 */
public INDArray getGradient(INDArray gradient, int iteration) {
    if (historicalGradient == null)
        throw new IllegalStateException("Updater has not been initialized with view state");

    historicalGradient.addi(gradient.mul(gradient));
    INDArray sqrtHistory = sqrt(historicalGradient.dup(gradientReshapeOrder), false).addi(epsilon);
    // lr * gradient / (sqrt(sumSquaredGradients) + epsilon)
    INDArray ret = gradient.muli(sqrtHistory.rdivi(learningRate));
    numIterations++;
    return ret;
}
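// Worked scalar example of the AdaGrad rule above (plain Java, no ND4J;
// hypothetical method name, illustrative numbers):
static double adagradStepExample() {
    double history = 0.0, lr = 0.01, eps = 1e-6;
    double g = 0.3;
    history += g * g;                             // 0.09
    return g * (lr / (Math.sqrt(history) + eps)); // 0.3 * 0.01 / 0.3 ≈ 0.01
}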
@Override
public Pair<INDArray, INDArray> backprop(INDArray in, INDArray epsilon) {
    /*
    //libnd4j only returns diagonal elements, fix in libnd4j?
    //derivative of softmax(in), shape = minibatch x classes, should give minibatch x classes x classes
    int miniBatchSize = in.shape()[0];
    int classSize = in.shape()[1];
    //if (in.rank() != 2) throw exception?
    INDArray z = Nd4j.zeros(miniBatchSize, classSize, classSize);
    INDArray i = Nd4j.eye(classSize);
    INDArray out = z.dup();

    //identity matrix extended to 3d
    Nd4j.getExecutioner().execAndReturn(new BroadcastAddOp(z, i, out, new int[] {1, 2}));
    //D_jS_j = S_i * (delta_ij - S_j)
    Nd4j.getExecutioner().execAndReturn(new BroadcastSubOp(out, in, z, new int[] {0, 1})); //1-p or -p
    Nd4j.getExecutioner().execAndReturn(new BroadcastMulOp(z, in, out, new int[] {0, 1})); //p*(1-p) or -pi*pj
    gradient = out;
    */

    //use loss fn utils and push this for next release
    //Nd4j.getExecutioner().execAndReturn(new SoftMax(in).derivative());
    //return in;

    INDArray out = Nd4j.getExecutioner().execAndReturn(new OldSoftMax(in));
    INDArray x = out.mul(epsilon).sum(1);
    INDArray dLdz = out.mul(epsilon.subColumnVector(x));
    return new Pair<>(dLdz, null);
}
@Override
public void exec() {
    //TODO add dimension arg. For now: hardcoded along dimension 1...
    INDArray softmax = Transforms.softmax(x, true);
    INDArray mul = softmax.mul(y);
    INDArray summed = mul.sum(1);
    Nd4j.getExecutioner().exec(new BroadcastSubOp(y, summed, z, 0));
}
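// Plain-Java check of the quantity assembled here: exec() computes
// eps_i - sum_j s_j * eps_j (with y as the upstream gradient eps); multiplying
// by s_i, as in the backprop method earlier, gives the full softmax gradient
// dL/dz_i = s_i * (eps_i - sum_j s_j * eps_j). Illustrative values:
static double[] softmaxGradExample() {
    double[] s = {0.2, 0.3, 0.5};    // softmax outputs (sum to 1)
    double[] eps = {1.0, 0.0, -1.0}; // upstream gradient
    double weighted = 0.0;
    for (int j = 0; j < s.length; j++)
        weighted += s[j] * eps[j];   // 0.2 + 0.0 - 0.5 = -0.3
    double[] dLdz = new double[s.length];
    for (int i = 0; i < s.length; i++)
        dLdz[i] = s[i] * (eps[i] - weighted); // [0.26, 0.09, -0.35]; sums to 0
    return dLdz;
}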
/**
 * This method returns, for each training sample and for each label in
 * that sample, the negative log likelihood of that value falling within
 * the given mixture of Gaussians.
 *
 * @param labels the target values
 * @param alpha  the mixing coefficients of the Gaussian mixture
 * @param mu     the means of the Gaussian mixture components
 * @param sigma  the standard deviations of the Gaussian mixture components
 * @return the negative log likelihood for each sample
 */
private INDArray negativeLogLikelihood(INDArray labels, INDArray alpha, INDArray mu, INDArray sigma) {
    INDArray labelsMinusMu = labelsMinusMu(labels, mu);
    INDArray diffsquared = labelsMinusMu.mul(labelsMinusMu).sum(2);
    INDArray phitimesalphasum = phi(diffsquared, sigma).muli(alpha).sum(1);

    // result = -ln[ sum_i alpha_i * phi_i(x) ]; see Bishop (28, 29)
    INDArray result = Transforms.log(phitimesalphasum).negi();
    return result;
}
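// Worked scalar example of the mixture negative log likelihood above
// (plain Java; two components with illustrative parameters):
static double mixtureNllExample() {
    double[] alpha = {0.4, 0.6};
    double[] mu = {0.0, 2.0};
    double[] sigma = {1.0, 1.0};
    double label = 1.0;
    double mixture = 0.0;
    for (int i = 0; i < alpha.length; i++) {
        double d2 = (label - mu[i]) * (label - mu[i]);
        double phi = Math.exp(-d2 / (2 * sigma[i] * sigma[i]))
                        / (sigma[i] * Math.sqrt(2 * Math.PI));
        mixture += alpha[i] * phi;
    }
    return -Math.log(mixture); // -ln(0.242) ≈ 1.419
}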
public static boolean checkMulManually(INDArray first, INDArray second, double maxRelativeDifference,
                double minAbsDifference) {
    //No apache commons element-wise multiply, but can do this manually
    INDArray result = first.mul(second);
    long[] shape = first.shape();
    INDArray expected = Nd4j.zeros(first.shape());
    for (int i = 0; i < shape[0]; i++) {
        for (int j = 0; j < shape[1]; j++) {
            double v = first.getDouble(i, j) * second.getDouble(i, j);
            expected.putScalar(new int[] {i, j}, v);
        }
    }
    if (!checkShape(expected, result))
        return false;
    boolean ok = checkEntries(expected, result, maxRelativeDifference, minAbsDifference);
    if (!ok) {
        INDArray onCopies = Shape.toOffsetZeroCopy(first).mul(Shape.toOffsetZeroCopy(second));
        printFailureDetails(first, second, expected, result, onCopies, "mul");
    }
    return ok;
}
/**
 * Gets feature-specific learning rates.
 * Adagrad keeps a history of gradients being passed in.
 * Note that each gradient passed in becomes adapted over time, hence the name adagrad ("adaptive gradient").
 *
 * @param gradient  the gradient to get learning rates for
 * @param iteration the current iteration
 * @param epoch     the current epoch
 */
@Override
public void applyUpdater(INDArray gradient, int iteration, int epoch) {
    if (historicalGradient == null)
        throw new IllegalStateException("Updater has not been initialized with view state");

    double learningRate = config.getLearningRate(iteration, epoch);
    double epsilon = config.getEpsilon();

    historicalGradient.addi(gradient.mul(gradient));
    INDArray sqrtHistory = sqrt(historicalGradient.dup(gradientReshapeOrder), false).addi(epsilon);
    // lr * gradient / (sqrt(sumSquaredGradients) + epsilon)
    gradient.muli(sqrtHistory.rdivi(learningRate));
}
/**
 * Scales the ndarray columns to the given min/max values
 *
 * @param min     the minimum value
 * @param max     the maximum value
 * @param toScale the array to scale (modified in place)
 */
public static void scaleMinMax(double min, double max, INDArray toScale) {
    //X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
    //X_scaled = X_std * (max - min) + min
    INDArray min2 = toScale.min(0);
    INDArray max2 = toScale.max(0);
    INDArray std = toScale.subRowVector(min2).diviRowVector(max2.sub(min2));
    INDArray scaled = std.mul(max - min).addi(min);
    toScale.assign(scaled);
}
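// Worked scalar example of the min-max rule above (plain Java; one column
// with illustrative values, scaled into [0, 10]):
static double[] scaleMinMaxExample() {
    double[] col = {2.0, 4.0, 6.0};
    double colMin = 2.0, colMax = 6.0, lo = 0.0, hi = 10.0;
    for (int i = 0; i < col.length; i++) {
        double std = (col[i] - colMin) / (colMax - colMin); // 0.0, 0.5, 1.0
        col[i] = std * (hi - lo) + lo;                      // 0.0, 5.0, 10.0
    }
    return col;
}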
public INDArray getGradient(INDArray gradient, int slice, int[] shape) {
    boolean historicalInitialized = false;
    INDArray sqrtHistory;

    if (this.historicalGradient == null) {
        this.historicalGradient = Nd4j.zeros(shape).add(epsilon); //ensure no zeros
        historicalInitialized = true;
    } else if (!this.historicalGradient.isVector()
                    && this.historicalGradient.slice(slice).length() != gradient.length())
        throw new IllegalArgumentException("Illegal gradient");

    if (historicalGradient.isVector())
        sqrtHistory = sqrt(historicalGradient);
    else
        sqrtHistory = !historicalInitialized ? sqrt(historicalGradient.slice(slice)) : historicalGradient;

    INDArray learningRates;
    try {
        learningRates = sqrtHistory.rdivi(learningRate);
    } catch (ArithmeticException ae) {
        learningRates = sqrtHistory.rdivi(learningRate + epsilon);
    }

    if (gradient.length() != learningRates.length())
        gradient.muli(learningRates.slice(slice));
    else
        gradient.muli(learningRates);

    this.historicalGradient.slice(slice).addi(gradient.mul(gradient));
    numIterations++;
    return gradient;
}
@Override
public INDArray getActivation(INDArray in, boolean training) {
    if (training) {
        try (MemoryWorkspace ws = Nd4j.getWorkspaceManager().scopeOutOfWorkspaces()) {
            this.alpha = Nd4j.rand(in.shape(), l, u, Nd4j.getRandom());
        }
        INDArray inTimesAlpha = in.mul(alpha);
        BooleanIndexing.replaceWhere(in, inTimesAlpha, Conditions.lessThan(0));
    } else {
        this.alpha = null;
        double a = 0.5 * (l + u);
        return Nd4j.getExecutioner().execAndReturn(new RectifedLinear(in, a));
    }
    return in;
}
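// Scalar sketch of the RReLU rule implemented above: during training a
// negative input is scaled by a slope alpha drawn uniformly from [l, u];
// at test time the fixed average slope (l + u) / 2 is used instead
// (plain Java; l, u and the draw are illustrative):
static double[] rreluScalarExample() {
    double l = 0.125, u = 0.333, in = -2.0;
    double alpha = 0.2;                                // hypothetical draw from U(l, u)
    double trainOut = in < 0 ? in * alpha : in;        // -0.4
    double testOut = in < 0 ? in * 0.5 * (l + u) : in; // -0.458
    return new double[] {trainOut, testOut};
}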
@Override
public INDArray sample(int[] shape) {
    int numRows = 1;
    for (int i = 0; i < shape.length - 1; i++)
        numRows *= shape[i];
    int numCols = shape[shape.length - 1];

    val flatShape = new int[] {numRows, numCols};
    val flatRng = Nd4j.getExecutioner().exec(
                    new GaussianDistribution(Nd4j.createUninitialized(flatShape, Nd4j.order()), 0.0, 1.0), random);

    long m = flatRng.rows();
    long n = flatRng.columns();

    val s = Nd4j.create(m < n ? m : n);
    val u = m < n ? Nd4j.create(m, n) : Nd4j.create(m, m);
    val v = Nd4j.create(n, n, 'f');

    Nd4j.getBlasWrapper().lapack().gesvd(flatRng, s, u, v);

    // FIXME: int cast
    if (gains == null) {
        if (u.rows() == numRows && u.columns() == numCols) {
            return v.get(NDArrayIndex.interval(0, numRows), NDArrayIndex.interval(0, numCols)).mul(gain)
                            .reshape(ArrayUtil.toLongArray(shape));
        } else {
            return u.get(NDArrayIndex.interval(0, numRows), NDArrayIndex.interval(0, numCols)).mul(gain)
                            .reshape(ArrayUtil.toLongArray(shape));
        }
    } else {
        throw new UnsupportedOperationException();
    }
}
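// Plain-Java check of the property this sampler targets: the factor returned
// by the SVD has orthonormal columns, i.e. Q^T Q = I. Checked on a hand-built
// 2x2 rotation matrix (illustrative, not an actual sample):
static boolean orthonormalCheckExample() {
    double theta = 0.3;
    double[][] q = {{Math.cos(theta), -Math.sin(theta)},
                    {Math.sin(theta), Math.cos(theta)}};
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            double dot = q[0][i] * q[0][j] + q[1][i] * q[1][j]; // (Q^T Q)[i][j]
            if (Math.abs(dot - (i == j ? 1.0 : 0.0)) > 1e-12)
                return false;
        }
    }
    return true;
}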
public static boolean checkDivManually(INDArray first, INDArray second, double maxRelativeDifference,
                double minAbsDifference) {
    //No apache commons element-wise division, but can do this manually
    INDArray result = first.div(second);
    long[] shape = first.shape();
    INDArray expected = Nd4j.zeros(first.shape());
    for (int i = 0; i < shape[0]; i++) {
        for (int j = 0; j < shape[1]; j++) {
            double v = first.getDouble(i, j) / second.getDouble(i, j);
            expected.putScalar(new int[] {i, j}, v);
        }
    }
    if (!checkShape(expected, result))
        return false;
    boolean ok = checkEntries(expected, result, maxRelativeDifference, minAbsDifference);
    if (!ok) {
        //use div (not mul) here so the offset-zero-copy check matches the operation under test
        INDArray onCopies = Shape.toOffsetZeroCopy(first).div(Shape.toOffsetZeroCopy(second));
        printFailureDetails(first, second, expected, result, onCopies, "div");
    }
    return ok;
}