case ACTIVATIONS:
    return minibatchSize * outputType.arrayElementsPerExample() * bytesPerElement;
case ACTIVATION_GRADIENTS:
    if (memoryUseMode == MemoryUseMode.INFERENCE) {
        //No activation gradients during inference
        return 0;
    }
    //Activation gradients have the same shape as the layer input
    return minibatchSize * inputType.arrayElementsPerExample() * bytesPerElement;
case UPDATER_STATE:
    if (memoryUseMode == MemoryUseMode.INFERENCE) {
        //No updater state during inference
        return 0;
    }
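//The switch above returns element counts converted to bytes. Below is a minimal
//standalone sketch (not DL4J code) of the same arithmetic; the minibatch size,
//per-example element counts, and element width are assumed values for illustration.
public class MemoryBytesSketch {
    public static void main(String[] args) {
        long minibatchSize = 32;
        long inputElementsPerExample = 784;   //e.g. a flattened 28x28 input
        long outputElementsPerExample = 100;
        long bytesPerElement = 4;             //float32

        //ACTIVATIONS: one output array per example
        long activations = minibatchSize * outputElementsPerExample * bytesPerElement;
        //ACTIVATION_GRADIENTS: same shape as the input; zero in inference mode
        long activationGradients = minibatchSize * inputElementsPerExample * bytesPerElement;

        System.out.println("Activations:          " + activations + " bytes");         //12800
        System.out.println("Activation gradients: " + activationGradients + " bytes"); //100352
    }
}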
@Override
public LayerMemoryReport getMemoryReport(InputType inputType) {
    int actElementsPerEx = inputType.arrayElementsPerExample();
    //During inference: not applied. During backprop: dup the input, in case it's used elsewhere
    //But: this will be counted in the activations
    //(technically inference memory is over-estimated as a result)
    return new LayerMemoryReport.Builder(layerName, DropoutLayer.class, inputType, inputType)
            .standardMemory(0, 0) //No params
            .workingMemory(0, 0, 0, 0) //No working mem, other than activations etc
            .cacheMemory(MemoryReport.CACHE_MODE_ALL_ZEROS, MemoryReport.CACHE_MODE_ALL_ZEROS) //No caching
            .build();
}
@Override
public LayerMemoryReport getMemoryReport(InputType inputType) {
    int actElementsPerEx = inputType.arrayElementsPerExample();
    //Forward pass: 3x input size as working memory, in addition to output activations
    //Backward pass: 2x input size as working memory, in addition to epsilons
    return new LayerMemoryReport.Builder(layerName, DenseLayer.class, inputType, inputType)
            .standardMemory(0, 0)
            .workingMemory(0, 2 * actElementsPerEx, 0, 3 * actElementsPerEx)
            .cacheMemory(MemoryReport.CACHE_MODE_ALL_ZEROS, MemoryReport.CACHE_MODE_ALL_ZEROS) //No caching in DenseLayer
            .build();
}
@Override
public LayerMemoryReport getMemoryReport(InputType inputType) {
    int actElementsPerEx = inputType.arrayElementsPerExample();
    return new LayerMemoryReport.Builder(layerName, ActivationLayer.class, inputType, inputType)
            .standardMemory(0, 0) //No params
            //During inference: modify input activation in-place
            //During backprop: dup the input for later re-use
            .workingMemory(0, 0, 0, actElementsPerEx)
            .cacheMemory(MemoryReport.CACHE_MODE_ALL_ZEROS, MemoryReport.CACHE_MODE_ALL_ZEROS) //No caching
            .build();
}
@Override
public LayerMemoryReport getMemoryReport(InputType inputType) {
    //Because of supervised + unsupervised modes: we'll assume unsupervised, which has the larger memory requirements
    InputType outputType = getOutputType(-1, inputType);

    int actElementsPerEx = outputType.arrayElementsPerExample() + inputType.arrayElementsPerExample();
    int numParams = initializer().numParams(this);
    int updaterStateSize = (int) getIUpdater().stateSize(numParams);

    int trainSizePerEx = 0;
    if (getDropOut() > 0) {
        if (false) {
            //TODO drop connect
            //Dup the weights... note that this does NOT depend on the minibatch size...
        } else {
            //Assume we dup the input
            trainSizePerEx += inputType.arrayElementsPerExample();
        }
    }

    //Also, during backprop: we do a preOut call -> gives us activations size equal to the output size
    // which is modified in-place by loss function
    trainSizePerEx += actElementsPerEx;

    return new LayerMemoryReport.Builder(layerName, AutoEncoder.class, inputType, outputType)
            .standardMemory(numParams, updaterStateSize)
            .workingMemory(0, 0, 0, trainSizePerEx)
            .cacheMemory(MemoryReport.CACHE_MODE_ALL_ZEROS, MemoryReport.CACHE_MODE_ALL_ZEROS) //No caching
            .build();
}
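//A hedged usage sketch (assuming the standard DL4J configuration classes are on the
//classpath): build a layer configuration such as the AutoEncoder above and ask it
//for its memory report. The nIn/nOut values are illustrative only.
import org.deeplearning4j.nn.conf.inputs.InputType;
import org.deeplearning4j.nn.conf.layers.AutoEncoder;
import org.deeplearning4j.nn.conf.memory.LayerMemoryReport;

public class MemoryReportUsageSketch {
    public static void main(String[] args) {
        AutoEncoder autoEncoder = new AutoEncoder.Builder()
                .nIn(784)
                .nOut(250)
                .build();
        //Covers parameters, updater state, and the working memory estimated above
        LayerMemoryReport report = autoEncoder.getMemoryReport(InputType.feedForward(784));
        System.out.println(report);
    }
}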
int actElementsPerEx = outputType.arrayElementsPerExample();
int unsupervisedPerEx = getK() * 2 * actElementsPerEx + inputType.arrayElementsPerExample();
int numParams = initializer().numParams(this);
int updaterStateSize = (int) getIUpdater().stateSize(numParams);
trainSizePerEx += inputType.arrayElementsPerExample();
@Override
public MemoryReport getMemoryReport(InputType... inputTypes) {
    InputType outputType = getOutputType(-1, inputTypes);

    //Inference: only calculation is for output activations; no working memory
    //Working memory for training:
    //1 for each example (fwd pass) + output size (1 per ex) + input size + output size... in addition to the returned eps arrays
    //output size == input size here
    int trainWorkingSizePerEx = 3 + 2 * inputTypes[0].arrayElementsPerExample();

    return new LayerMemoryReport.Builder(null, L2Vertex.class, inputTypes[0], outputType)
            .standardMemory(0, 0) //No params
            .workingMemory(0, 0, 0, trainWorkingSizePerEx)
            .cacheMemory(0, 0) //No caching
            .build();
}
}
@Override
public MemoryReport getMemoryReport(InputType... inputTypes) {
    InputType outputType = getOutputType(-1, inputTypes);

    //norm2 value (inference working mem): 1 per example during forward pass
    //Training working mem: 2 per example + 2x input size + 1 per example (in addition to epsilons)
    int trainModePerEx = 3 + 2 * inputTypes[0].arrayElementsPerExample();

    return new LayerMemoryReport.Builder(null, L2NormalizeVertex.class, inputTypes[0], outputType)
            .standardMemory(0, 0) //No params
            .workingMemory(0, 1, 0, trainModePerEx)
            .cacheMemory(0, 0) //No caching
            .build();
}
}
@Override
public LayerMemoryReport getMemoryReport(InputType inputType) {
    InputType outputType = getOutputType(-1, inputType);

    //TODO CuDNN helper etc

    int numParams = initializer().numParams(this);
    int updaterStateSize = 0;
    for (String s : BatchNormalizationParamInitializer.keys()) {
        updaterStateSize += getIUpdaterByParam(s).stateSize(nOut);
    }

    //During forward pass: working memory size approx. equal to 2x input size (copy ops, etc)
    int inferenceWorkingSize = 2 * inputType.arrayElementsPerExample();

    //During training: we calculate mean and variance... result is equal to nOut, and INDEPENDENT of minibatch size
    int trainWorkFixed = 2 * nOut;
    //During backprop: multiple working arrays... output size, plus 2 * nOut (independent of example size)
    int trainWorkingSizePerExample = inferenceWorkingSize //Inference during backprop
            + (outputType.arrayElementsPerExample() + 2 * nOut); //Backprop gradient calculation

    return new LayerMemoryReport.Builder(layerName, BatchNormalization.class, inputType, outputType)
            .standardMemory(numParams, updaterStateSize)
            .workingMemory(0, 0, trainWorkFixed, trainWorkingSizePerExample) //No additional memory (beyond activations) for inference
            .cacheMemory(MemoryReport.CACHE_MODE_ALL_ZEROS, MemoryReport.CACHE_MODE_ALL_ZEROS) //No caching
            .build();
}
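//Illustrative arithmetic for the BatchNormalization working-memory terms above,
//assuming nOut = 256 with a 256-element input/output per example (all values are
//assumptions for this example only):
public class BatchNormMemorySketch {
    public static void main(String[] args) {
        int nOut = 256;
        int inputElementsPerExample = 256;
        int outputElementsPerExample = 256;

        int inferenceWorkingSize = 2 * inputElementsPerExample;   //copy ops etc
        int trainWorkFixed = 2 * nOut;                            //mean + variance: minibatch-independent
        int trainWorkingSizePerExample = inferenceWorkingSize     //inference during backprop
                + outputElementsPerExample + 2 * nOut;            //backprop gradient calculation

        System.out.println("Fixed training working memory (elements):       " + trainWorkFixed);             //512
        System.out.println("Per-example training working memory (elements): " + trainWorkingSizePerExample); //1280
    }
}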
@Override
public LayerMemoryReport getMemoryReport(InputType inputType) {
    //Basically a dense layer...
    InputType outputType = getOutputType(-1, inputType);

    int numParams = initializer().numParams(this);
    int updaterStateSize = (int) getIUpdater().stateSize(numParams);

    int trainSizeFixed = 0;
    int trainSizeVariable = 0;
    if (getDropOut() > 0) {
        if (false) {
            //TODO drop connect
            //Dup the weights... note that this does NOT depend on the minibatch size...
            trainSizeVariable += 0; //TODO
        } else {
            //Assume we dup the input
            trainSizeVariable += inputType.arrayElementsPerExample();
        }
    }

    //Also, during backprop: we do a preOut call -> gives us activations size equal to the output size
    // which is modified in-place by activation function backprop
    // then we have 'epsilonNext' which is equivalent to input size
    trainSizeVariable += outputType.arrayElementsPerExample();

    return new LayerMemoryReport.Builder(layerName, OutputLayer.class, inputType, outputType)
            .standardMemory(numParams, updaterStateSize)
            .workingMemory(0, 0, trainSizeFixed, trainSizeVariable) //No additional memory (beyond activations) for inference
            .cacheMemory(MemoryReport.CACHE_MODE_ALL_ZEROS, MemoryReport.CACHE_MODE_ALL_ZEROS) //No caching
            .build();
}
@Override
public LayerMemoryReport getMemoryReport(InputType inputType) {
    InputType outputType = getOutputType(-1, inputType);

    int fwdTrainInferenceWorkingPerEx = 0;
    //Here: we'll assume we are doing 'full array' global pooling.
    //For max/avg/sum pooling, no working memory is needed (see GlobalPoolingLayer.activateHelperFullArray)
    //But for pnorm, we have working memory:
    if (poolingType == PoolingType.PNORM) {
        //Dup the input array once before the pnorm reduction
        fwdTrainInferenceWorkingPerEx = inputType.arrayElementsPerExample();
    }

    return new LayerMemoryReport.Builder(layerName, GlobalPoolingLayer.class, inputType, outputType)
            .standardMemory(0, 0) //No params
            //Train + inference: no additional working memory (except pnorm) - the reduction is the output activations
            .workingMemory(0, fwdTrainInferenceWorkingPerEx, 0, fwdTrainInferenceWorkingPerEx)
            .cacheMemory(MemoryReport.CACHE_MODE_ALL_ZEROS, MemoryReport.CACHE_MODE_ALL_ZEROS) //No caching
            .build();
}
@Override
public LayerMemoryReport getMemoryReport(InputType inputType) {
    InputType outputType = getOutputType(-1, inputType);

    int numParams = initializer().numParams(this);
    int updaterStateSize = (int) getIUpdater().stateSize(numParams);

    int trainSizeFixed = 0;
    int trainSizeVariable = 0;
    if (getDropOut() > 0) {
        if (false) {
            //TODO drop connect
            //Dup the weights... note that this does NOT depend on the minibatch size...
            trainSizeVariable += 0; //TODO
        } else {
            //Assume we dup the input
            trainSizeVariable += inputType.arrayElementsPerExample();
        }
    }

    //Also, during backprop: we do a preOut call -> gives us activations size equal to the output size
    // which is modified in-place by activation function backprop
    // then we have 'epsilonNext' which is equivalent to input size
    trainSizeVariable += outputType.arrayElementsPerExample();

    return new LayerMemoryReport.Builder(layerName, DenseLayer.class, inputType, outputType)
            .standardMemory(numParams, updaterStateSize)
            .workingMemory(0, 0, trainSizeFixed, trainSizeVariable) //No additional memory (beyond activations) for inference
            .cacheMemory(MemoryReport.CACHE_MODE_ALL_ZEROS, MemoryReport.CACHE_MODE_ALL_ZEROS) //No caching in DenseLayer
            .build();
}
trainWorkingSizePerEx += inputType.arrayElementsPerExample();
} else {
    trainSizeVariable += inputType.arrayElementsPerExample();
    trainSizeVariable += outputType.arrayElementsPerExample();
int actElementsPerEx = outputType.arrayElementsPerExample();
int numParams = initializer().numParams(this);
int updaterStateSize = (int) getIUpdater().stateSize(numParams);
trainWorkingMemSize += inputType.arrayElementsPerExample();
@Override
public LayerMemoryReport getMemoryReport(InputType inputType) {
    //Basically a dense layer, but no dropout is possible here, and no epsilons
    InputType outputType = getOutputType(-1, inputType);

    int actElementsPerEx = outputType.arrayElementsPerExample();
    int numParams = initializer().numParams(this);
    int updaterStateSize = (int) getIUpdater().stateSize(numParams);

    //Embedding layer does not use caching.
    //Inference: no working memory - just activations (pullRows)
    //Training: preOut op; the only in-place ops are on epsilon (from the layer above) + assign ops
    return new LayerMemoryReport.Builder(layerName, EmbeddingLayer.class, inputType, outputType)
            .standardMemory(numParams, updaterStateSize)
            .workingMemory(0, 0, 0, actElementsPerEx)
            .cacheMemory(MemoryReport.CACHE_MODE_ALL_ZEROS, MemoryReport.CACHE_MODE_ALL_ZEROS) //No caching
            .build();
}
@Override
public LayerMemoryReport getMemoryReport(InputType inputType) {
    InputType.InputTypeConvolutional c = (InputType.InputTypeConvolutional) inputType;
    InputType.InputTypeConvolutional outputType = (InputType.InputTypeConvolutional) getOutputType(-1, inputType);
    int actElementsPerEx = outputType.arrayElementsPerExample();

    //TODO Subsampling helper memory use... (CuDNN etc)

    //During forward pass: im2col array + reduce. Reduce is counted as activations, so only im2col is working mem
    int im2colSizePerEx = c.getDepth() * outputType.getHeight() * outputType.getWidth() * kernelSize[0] * kernelSize[1];

    //Current implementation does NOT cache im2col etc... which means: it's recalculated on each backward pass
    int trainingWorkingSizePerEx = im2colSizePerEx;
    if (getDropOut() > 0) {
        //Dup on the input before dropout, but only for training
        trainingWorkingSizePerEx += inputType.arrayElementsPerExample();
    }

    return new LayerMemoryReport.Builder(layerName, SubsamplingLayer.class, inputType, outputType)
            .standardMemory(0, 0) //No params
            .workingMemory(0, im2colSizePerEx, 0, trainingWorkingSizePerEx)
            .cacheMemory(MemoryReport.CACHE_MODE_ALL_ZEROS, MemoryReport.CACHE_MODE_ALL_ZEROS) //No caching
            .build();
}
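//A worked example of the im2col sizing used above, assuming a 3-channel 32x32 input
//downsampled with a 2x2 kernel at stride 2 to a 16x16 output (all values assumed):
public class Im2ColSizeSketch {
    public static void main(String[] args) {
        int depth = 3;
        int outHeight = 16, outWidth = 16;
        int kernelH = 2, kernelW = 2;

        //im2col materialises one kernel-sized patch per output location, per channel
        int im2colSizePerEx = depth * outHeight * outWidth * kernelH * kernelW;
        System.out.println("im2col elements per example: " + im2colSizePerEx); //3072
    }
}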