/**
 * This method calculates 'phi', the Gaussian probability density function (see Bishop eq. 23).
 *
 * @param diffSquared The squared 'x - mu' term of the Gaussian distribution (squared distance between 'x'
 *                    and the mean of the distribution).
 * @param sigma       The standard deviation of the Gaussian distribution.
 * @return An array of shape [nSamples, nLabels, nDistributions] containing the probability density (phi)
 *         for each of the samples * labels * distributions, for the given x, sigma, mu.
 */
private INDArray phi(INDArray diffSquared, INDArray sigma) {
    // 1/(sqrt(2*PI)*s)^L * e^(-(x-mu)^2 / (2*s^2)), with L = mLabelWidth
    INDArray minusTwoVariance = sigma.mul(sigma).muli(2).negi();
    // This is phi_i(x, mu, sigma)
    INDArray likelihoods = Transforms.exp(diffSquared.divi(minusTwoVariance))
                    .divi(Transforms.pow(sigma.mul(SQRT_TWO_PI), (double) mLabelWidth));
    return likelihoods;
}
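For reference, a hedged reading of the density this snippet computes, writing L for mLabelWidth (the label dimensionality, inferred from the exponent in the divisor):

\[ \phi_i(\mathbf{x}) = \frac{1}{(\sqrt{2\pi}\,\sigma_i)^{L}} \exp\!\left(-\frac{\lVert \mathbf{x} - \boldsymbol{\mu}_i \rVert^2}{2\sigma_i^2}\right) \]

This is the isotropic-kernel form cited in the comment (Bishop eq. 23), with a single shared sigma per mixture component.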
/**
 * This method returns an array containing, for each training sample and each
 * label within that sample, the negative log likelihood of that value falling
 * within the given Gaussian mixtures.
 *
 * @param labels the label (target) values
 * @param alpha  the mixture weights of the Gaussian components
 * @param mu     the means of the Gaussian components
 * @param sigma  the standard deviations of the Gaussian components
 * @return the negative log likelihood for each sample
 */
private INDArray negativeLogLikelihood(INDArray labels, INDArray alpha, INDArray mu, INDArray sigma) {
    INDArray labelsMinusMu = labelsMinusMu(labels, mu);
    INDArray diffSquared = labelsMinusMu.mul(labelsMinusMu).sum(2);
    INDArray phiTimesAlphaSum = phi(diffSquared, sigma).muli(alpha).sum(1);
    // result: see Bishop eq. 28, 29
    return Transforms.log(phiTimesAlphaSum).negi();
}
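Stitched together with the phi method above, the quantity being computed appears to be the mixture density error from the cited equations (Bishop eq. 28/29):

\[ E = -\ln \sum_{i=1}^{m} \alpha_i \, \phi_i(\mathbf{t}) \]

where the sum(1) in the code runs over the m mixture components.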
@Override
public INDArray computeGradient(INDArray labels, INDArray preOutput, IActivation activationFn, INDArray mask) {
    if (labels.size(1) != preOutput.size(1)) {
        throw new IllegalArgumentException("Labels array numColumns (size(1) = " + labels.size(1)
                        + ") does not match output layer number of outputs (nOut = " + preOutput.size(1) + ")");
    }

    INDArray output = activationFn.getActivation(preOutput.dup(), true);
    INDArray dLda = labels.div(output).negi();

    if (mask != null && LossUtil.isPerOutputMasking(dLda, mask)) {
        //For *most* activation functions we don't actually need to mask dL/da in addition to masking dL/dz later,
        //but some, like softmax, require both (because dL/dz_i is a function of dL/da_j, for i != j).
        //We could special-case softmax (activationFn instanceof ActivationSoftmax), but that would be
        //error-prone, though it would buy us a tiny bit of performance.
        LossUtil.applyMask(dLda, mask);
    }

    INDArray grad = activationFn.backprop(preOutput, dLda).getFirst(); //TODO activation functions with params

    if (mask != null) {
        LossUtil.applyMask(grad, mask);
    }
    return grad;
}
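For context on the dLda line: with the multiclass cross-entropy loss \( L = -\sum_i y_i \ln a_i \) (y = labels, a = activated output), the derivative is

\[ \frac{\partial L}{\partial a_i} = -\frac{y_i}{a_i}, \]

which is exactly what labels.div(output).negi() builds. The next snippet is an in-place variant of the same expression.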
//Equivalent in-place form: rdivi is reverse division, so this computes labels / output, overwriting output
INDArray dLda = output.rdivi(labels).negi();
@Override
public INDArray computeGradient(INDArray labels, INDArray preOutput, IActivation activationFn, INDArray mask) {
    double[] d = computeScoreNumDenom(labels, preOutput, activationFn, mask, false);
    double numerator = d[0];
    double denominator = d[1];

    if (numerator == 0.0 && denominator == 0.0) {
        //Zero score -> zero gradient
        return Nd4j.create(preOutput.shape());
    }

    double secondTerm = numerator / (denominator * denominator);

    INDArray dLdOut;
    if (labels.size(1) == 1) {
        //Single binary output case
        dLdOut = labels.mul(1 + beta * beta).divi(denominator).subi(secondTerm);
    } else {
        //Softmax case: the getColumn(1) here is to account for the fact that we're using prob(class1)
        //only in the score function; column(1) is equivalent to output for the single output case
        dLdOut = Nd4j.create(labels.shape());
        dLdOut.getColumn(1).assign(labels.getColumn(1).mul(1 + beta * beta).divi(denominator).subi(secondTerm));
    }

    //Negate relative to description in paper, as we want to *minimize* 1.0 - fMeasure, which is
    //equivalent to maximizing fMeasure
    dLdOut.negi();

    INDArray dLdPreOut = activationFn.backprop(preOutput, dLdOut).getFirst();

    if (mask != null) {
        dLdPreOut.muliColumnVector(mask);
    }
    return dLdPreOut;
}
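A sketch of where the two terms come from, assuming the usual F-beta definition over true positives, false positives, and false negatives (this is a reading of the code, not taken from the source):

\[ F_\beta = \frac{(1+\beta^2)\,TP}{(1+\beta^2)\,TP + \beta^2\,FN + FP} \]

With \( TP = \sum_i y_i p_i \), \( FN = \sum_i y_i (1 - p_i) \), and \( FP = \sum_i (1 - y_i) p_i \), the denominator simplifies to \( \beta^2 \sum_i y_i + \sum_i p_i \), so its derivative with respect to any single \( p_i \) is 1, and the quotient rule gives

\[ \frac{\partial F_\beta}{\partial p_i} = \frac{(1+\beta^2)\, y_i}{\text{den}} - \frac{\text{num}}{\text{den}^2}, \]

matching the numerator/denominator and secondTerm structure above.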
gradientOutput.getRow(i).assign(classificationDifferences.sum(0)
                .addi(classificationDifferences.sum(1).transposei().negi()));
INDArray minusTwoVariance = variance.mul(2).negi();
INDArray normalPart = mdc.alpha.div(Transforms.pow(mdc.sigma.mul(SQRT_TWO_PI), mLabelWidth));
INDArray exponent = labelsMinusMuSquared.div(minusTwoVariance);
dLdZMu.put(new INDArrayIndex[] {NDArrayIndex.all(), NDArrayIndex.all(), NDArrayIndex.point(k)},
                labelsMinusMu.get(new INDArrayIndex[] {NDArrayIndex.all(), NDArrayIndex.all(), NDArrayIndex.point(k)})
                                .muli(pi).divi(variance).negi());
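If pi holds the per-component posterior (responsibility), as in Bishop's MDN derivation (an assumption based on the variable names in this fragment), the dLdZMu line matches the standard mean gradient for component k:

\[ \frac{\partial E}{\partial \mu_k} = \pi_k \, \frac{\mu_k - \mathbf{t}}{\sigma^2} = -\pi_k \, \frac{\mathbf{t} - \mu_k}{\sigma^2} \]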
@Override
public DoubleTensor unaryMinusInPlace() {
    tensor.negi();
    return this;
}
@Override
public IntegerTensor unaryMinusInPlace() {
    tensor.negi();
    return this;
}
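Both wrappers rely on the ND4J convention that the i-suffixed op mutates its receiver, which is why they can return this. A minimal sketch of the difference between neg() and negi() (variable names hypothetical):

INDArray a = Nd4j.create(new double[] {1.0, -2.0, 3.0});
INDArray b = a.neg();   // returns a negated copy; 'a' is unchanged
a.negi();               // negates 'a' in place and returns 'a' itself
// 'a' and 'b' now both hold [-1.0, 2.0, -3.0]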
@Override
public INDArray exampleNegLogProbability(INDArray x, INDArray preOutDistributionParams) {
    INDArray logProb = calcLogProbArray(x, preOutDistributionParams);
    return logProb.sum(1).negi();
}
@Override
public INDArray exampleNegLogProbability(INDArray x, INDArray preOutDistributionParams) {
    INDArray gamma = preOutDistributionParams.dup();
    activationFn.getActivation(gamma, false);
    INDArray lambda = Transforms.exp(gamma, true);
    //For an exponential distribution with rate lambda = e^gamma:
    //log p(x) = log(lambda) - lambda*x = gamma - lambda*x, so NLL = -(sum of (gamma - lambda*x))
    return lambda.muli(x).rsubi(gamma).sum(1).negi();
}
@Override
public INDArray generateRandom(INDArray preOutDistributionParams) {
    INDArray gamma = activationFn.getActivation(preOutDistributionParams.dup(), false);
    INDArray lambda = Transforms.exp(gamma, true);

    //Inverse cumulative distribution function: -log(1-p)/lambda
    INDArray u = Nd4j.rand(preOutDistributionParams.shape());

    //Note here: if u ~ U(0,1) then 1-u ~ U(0,1)
    return Transforms.log(u, false).divi(lambda).negi();
}
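The sampling comment can be checked directly: for an exponential distribution with rate lambda, the CDF is \( F(x) = 1 - e^{-\lambda x} \), so inverse transform sampling gives

\[ x = -\frac{\ln(1-u)}{\lambda}, \quad u \sim U(0,1), \]

and because \( 1-u \) is itself uniform on (0,1), the code's \( -\ln(u)/\lambda \) draws from the same distribution.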
@Override
public INDArray gradient(INDArray x, INDArray preOutDistributionParams) {
    INDArray output = preOutDistributionParams.dup();
    activationFn.getActivation(output, true);

    INDArray diff = x.sub(output);
    INDArray outOneMinusOut = output.rsub(1.0).muli(output);

    INDArray grad = diff.divi(outOneMinusOut);
    grad = activationFn.backprop(preOutDistributionParams.dup(), grad).getFirst();

    //Issue: if output == 0 or output == 1, then (assuming sigmoid output or similar) sigmaPrime == 0,
    //and sigmaPrime * (x-out) / (out*(1-out)) == 0 * (x-out) / 0 -> 0/0 -> NaN. But taking the limit,
    //we want 0*(x-out)/0 == 0, which implies a 0 gradient at the far extremes (0 or 1) of the output
    BooleanIndexing.replaceWhere(grad, 0.0, Conditions.isNan());
    return grad.negi();
}
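The diff/outOneMinusOut expression is the gradient of a Bernoulli log-likelihood (an inference from the code; the distribution is not named in the snippet). With \( \ln p(x \mid \mu) = x \ln \mu + (1-x) \ln(1-\mu) \):

\[ \frac{\partial \ln p}{\partial \mu} = \frac{x}{\mu} - \frac{1-x}{1-\mu} = \frac{x - \mu}{\mu (1-\mu)}, \]

and the final negi() flips this into a negative log-likelihood gradient.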
@Override
public DoubleTensor setWithMaskInPlace(DoubleTensor mask, Double value) {
    if (this.getLength() != mask.getLength()) {
        throw new IllegalArgumentException("The lengths of the tensor and mask must match, but got tensor length: "
                        + this.getLength() + ", mask length: " + mask.getLength());
    }

    INDArray maskDup = unsafeGetNd4J(mask).dup();
    double trueValue = 1.0;
    if (value == 0.0) {
        //A fill value of 0.0 would be indistinguishable from the mask's own zeros,
        //so flip the mask (0 <-> 1) and invert the true/false values to compensate
        trueValue = 1.0 - trueValue;
        maskDup.negi().addi(1);
    }
    double falseValue = 1.0 - trueValue;

    Nd4j.getExecutioner().exec(new CompareAndSet(maskDup, value, Conditions.equals(trueValue)));
    Nd4j.getExecutioner().exec(new CompareAndSet(tensor, maskDup, Conditions.notEquals(falseValue)));

    return this;
}
public INDArray computeDlDx(INDArray labels, INDArray predictedScores) {
    INDArray mPluss = Nd4j.zeros(predictedScores.shape()).addi(mPlus);
    INDArray mMinuss = Nd4j.zeros(predictedScores.shape()).addi(mMinus);

    //Derivative of log(1 + exp(gamma * (mPlus - score))) with respect to the score:
    //-gamma * e^(gamma*(mPlus-score)) / (1 + e^(gamma*(mPlus-score)))
    INDArray leftHandExp = Transforms.exp(mPluss.subi(predictedScores).muli(gamma));
    INDArray leftHandNumerator = leftHandExp.mul(gamma);
    INDArray leftHandDenominator = leftHandExp.addi(1);
    INDArray leftHand = leftHandNumerator.divi(leftHandDenominator);
    leftHand.negi();

    //Derivative of log(1 + exp(gamma * (mMinus + score))) with respect to the score
    INDArray rightHandExp = Transforms.exp(mMinuss.addi(predictedScores).muli(gamma));
    INDArray rightHandNumerator = rightHandExp.mul(gamma);
    INDArray rightHandDenominator = rightHandExp.addi(1);
    INDArray rightHand = rightHandNumerator.divi(rightHandDenominator);

    //Shift the positive-class scores (by positiveClassExclusionFactor) so the argMax below
    //selects the highest-scoring negative class
    predictedScores = predictedScores.addi(labels.mul(positiveClassExclusionFactor));
    INDArray maxNegative = predictedScores.argMax(1);
    INDArray negWithHighestScoreMask = Nd4j.zeros(predictedScores.shape());
    for (int i = 0; i < maxNegative.length(); i++) {
        int index = maxNegative.getInt(i);
        negWithHighestScoreMask.put(i, index, 1);
    }

    //Apply the positive term to the correct classes and the negative term to the
    //highest-scoring negative class only
    leftHand = leftHand.muli(labels);
    rightHand = rightHand.muli(negWithHighestScoreMask);
    return leftHand.addi(rightHand);
}
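Reading the two derivative terms back into a loss, this gradient is consistent with a soft margin objective of the form (inferred from the code, not a cited reference):

\[ L = \sum_{c \in \text{pos}} \ln\!\left(1 + e^{\gamma (m_+ - s_c)}\right) + \ln\!\left(1 + e^{\gamma (m_- + s_{\hat c})}\right), \]

where \( s_c \) are the scores of the correct classes and \( s_{\hat c} \) is the score of the highest-scoring negative class.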
@Override
public INDArray gradient(INDArray x, INDArray preOutDistributionParams) {
    INDArray output = preOutDistributionParams.dup();
    activationFn.getActivation(output, true);

    int size = output.size(1) / 2;
    INDArray mean = output.get(NDArrayIndex.all(), NDArrayIndex.interval(0, size));
    INDArray logStdevSquared = output.get(NDArrayIndex.all(), NDArrayIndex.interval(size, 2 * size));

    INDArray sigmaSquared = Transforms.exp(logStdevSquared, true);
    INDArray xSubMean = x.sub(mean);
    INDArray xSubMeanSq = xSubMean.mul(xSubMean);

    INDArray dLdmu = xSubMean.divi(sigmaSquared);

    INDArray sigma = Transforms.sqrt(sigmaSquared, true);
    INDArray sigma3 = Transforms.pow(sigmaSquared, 3.0 / 2);

    INDArray dLdsigma = sigma.rdiv(-1).addi(xSubMeanSq.divi(sigma3));
    INDArray dLdlogSigma2 = sigma.divi(2).muli(dLdsigma);

    INDArray dLdx = Nd4j.createUninitialized(output.shape());
    dLdx.put(new INDArrayIndex[] {NDArrayIndex.all(), NDArrayIndex.interval(0, size)}, dLdmu);
    dLdx.put(new INDArrayIndex[] {NDArrayIndex.all(), NDArrayIndex.interval(size, 2 * size)}, dLdlogSigma2);
    dLdx.negi();

    //dL/dz
    return activationFn.backprop(preOutDistributionParams.dup(), dLdx).getFirst();
}
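The three gradient lines follow from the Gaussian log density. With \( \ln \mathcal{N}(x \mid \mu, \sigma^2) = -\tfrac{1}{2}\ln(2\pi\sigma^2) - \frac{(x-\mu)^2}{2\sigma^2} \):

\[ \frac{\partial \ln \mathcal{N}}{\partial \mu} = \frac{x-\mu}{\sigma^2}, \qquad \frac{\partial \ln \mathcal{N}}{\partial \sigma} = -\frac{1}{\sigma} + \frac{(x-\mu)^2}{\sigma^3}, \]

and since the network emits \( \gamma = \ln \sigma^2 \) with \( \sigma = e^{\gamma/2} \), the chain rule gives \( \partial/\partial\gamma = (\sigma/2)\,\partial/\partial\sigma \), which is the dLdlogSigma2 line. The final negi() converts log-likelihood gradients into negative log-likelihood gradients.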
public INDArray computeDlDx(INDArray labels, INDArray predictedScores) {
    INDArray mPluss = Nd4j.zeros(predictedScores.shape()).addi(mPlus);
    INDArray mMinuss = Nd4j.zeros(predictedScores.shape()).addi(mMinus);

    //gamma divided by the number of correct labels per example (clamped to at least 1)
    INDArray gammaDivNumCorrectLabels = Transforms.max(labels.sum(1), 1).rdivi(gamma);

    //Positive term: uses the summed score of the correct labels for each example
    INDArray positiveExamples = predictedScores.mul(labels);
    INDArray leftHandExp = Transforms.exp(positiveExamples.sum(1).rsubi(mPlus).muli(gammaDivNumCorrectLabels));
    INDArray leftHandNumerator = leftHandExp.mulColumnVector(gammaDivNumCorrectLabels);
    INDArray leftHandDenominator = leftHandExp.addi(1);
    INDArray leftHand = leftHandNumerator.divi(leftHandDenominator);
    leftHand = labels.mulColumnVector(leftHand.negi());

    //Negative term: same form as the single-label version above
    INDArray rightHandExp = Transforms.exp(mMinuss.addi(predictedScores).muli(gamma));
    INDArray rightHandNumerator = rightHandExp.mul(gamma);
    INDArray rightHandDenominator = rightHandExp.addi(1);
    INDArray rightHand = rightHandNumerator.divi(rightHandDenominator);

    //Shift the positive-class scores so the argMax below selects the highest-scoring negative class
    predictedScores = predictedScores.addi(labels.mul(positiveClassExclusionFactor));
    INDArray maxNegative = predictedScores.argMax(1);
    INDArray negWithHighestScoreMask = Nd4j.zeros(predictedScores.shape());
    for (int i = 0; i < maxNegative.length(); i++) {
        int index = maxNegative.getInt(i);
        negWithHighestScoreMask.put(i, index, 1);
    }

    leftHand = leftHand.muli(labels);
    rightHand = rightHand.muli(negWithHighestScoreMask);
    rightHand = rightHand.mulColumnVector(Transforms.min(labels.sum(1), 1)); //ignore examples without a correct label
    return leftHand.addi(rightHand);
}