.allowMultiGPU(true) .setMaximumDeviceCache(2L * 1024L * 1024L * 1024L) .allowCrossDeviceAccess(true);
/**
 * Private constructor — CudaEnvironment is a singleton, so the sole
 * instance owns a freshly built {@code Configuration}.
 */
private CudaEnvironment() {
    configuration = new Configuration();
}
try { int var = Integer.parseInt(System.getenv(ND4JEnvironmentVars.ND4J_CUDA_MAX_BLOCK_SIZE)); setMaximumBlockSize(var); } catch (Exception e) { log.error("Can't parse {}: [{}]", ND4JEnvironmentVars.ND4J_CUDA_MAX_BLOCK_SIZE, System.getenv(ND4JEnvironmentVars.ND4J_CUDA_MAX_BLOCK_SIZE)); try { int var = Integer.parseInt(System.getenv(ND4JEnvironmentVars.ND4J_CUDA_MIN_BLOCK_SIZE)); setMinimumBlockSize(var); } catch (Exception e) { log.error("Can't parse {}: [{}]", ND4JEnvironmentVars.ND4J_CUDA_MIN_BLOCK_SIZE, System.getenv(ND4JEnvironmentVars.ND4J_CUDA_MIN_BLOCK_SIZE)); try { int var = Integer.parseInt(System.getenv(ND4JEnvironmentVars.ND4J_CUDA_MAX_GRID_SIZE)); setMaximumGridSize(var); } catch (Exception e) { log.error("Can't parse {}: [{}]", ND4JEnvironmentVars.ND4J_CUDA_MAX_GRID_SIZE, System.getenv(ND4JEnvironmentVars.ND4J_CUDA_MAX_GRID_SIZE)); try { boolean var = Boolean.parseBoolean(System.getenv(ND4JEnvironmentVars.ND4J_CUDA_FORCE_SINGLE_GPU)); allowMultiGPU(!var); } catch (Exception e) { log.error("Can't parse {}: [{}]", ND4JEnvironmentVars.ND4J_CUDA_FORCE_SINGLE_GPU, System.getenv(ND4JEnvironmentVars.ND4J_CUDA_FORCE_SINGLE_GPU)); try { boolean var = Boolean.parseBoolean(System.getenv(ND4JEnvironmentVars.ND4J_CUDA_USE_PREALLOCATION)); allowPreallocation(var); } catch (Exception e) { log.error("Can't parse {}: [{}]", ND4JEnvironmentVars.ND4J_CUDA_USE_PREALLOCATION, System.getenv(ND4JEnvironmentVars.ND4J_CUDA_USE_PREALLOCATION));
public void applyConfiguration() { //log.info("Applying CUDA configuration..."); CudaEnvironment.getInstance().notifyConfigurationApplied(); NativeOpsHolder.getInstance().getDeviceNativeOps().enableDebugMode(configuration.isDebug()); //configuration.enableDebug(configuration.isDebug()); NativeOpsHolder.getInstance().getDeviceNativeOps().enableVerboseMode(configuration.isVerbose()); //configuration.setVerbose(configuration.isVerbose()); NativeOpsHolder.getInstance().getDeviceNativeOps().enableP2P(configuration.isCrossDeviceAccessAllowed()); //configuration.allowCrossDeviceAccess(configuration.isCrossDeviceAccessAllowed()); NativeOpsHolder.getInstance().getDeviceNativeOps().setGridLimit(configuration.getMaximumGridSize()); //configuration.setMaximumGridSize(configuration.getMaximumGridSize()); NativeOpsHolder.getInstance().getDeviceNativeOps().setOmpNumThreads(configuration.getMaximumBlockSize()); // configuration.setMaximumBlockSize(configuration.getMaximumBlockSize()); NativeOpsHolder.getInstance().getDeviceNativeOps().setOmpMinThreads(configuration.getMinimumBlockSize()); // configuration.setMinimumBlockSize(configuration.getMinimumBlockSize()); }
if (CudaEnvironment.getInstance().getConfiguration().isDebug()) context.syncOldStream();
public CudaZeroHandler() { configuration.setInitialized(); this.INITIAL_LOCATION = configuration.getFirstMemory(); switch (configuration.getExecutionModel()) { case OPTIMIZED: case ASYNCHRONOUS: { throw new RuntimeException("Unknown ExecutionModel: [" + configuration.getExecutionModel() + "]"); switch (configuration.getAllocationModel()) { case CACHE_ALL: this.memoryProvider = new CudaFullCachingProvider(); break; default: throw new RuntimeException("Unknown AllocationModel: [" + configuration.getAllocationModel() + "]");
/**
 * Returns the set of CUDA device ids currently available to this backend.
 *
 * @return a fresh {@link HashSet} copied from the configured device list,
 *         so callers cannot mutate internal state
 */
@Override
public Set<Integer> getAvailableDevices() {
    Set<Integer> devices = new HashSet<>(configuration.getAvailableDevices());
    return devices;
}
/** * This method returns device id available. Round-robin balancing used here. * * @param threadId this parameter can be anything, it's used for logging only. * @return */ protected Integer getNextDevice(long threadId) { Integer device = null; if (!CudaEnvironment.getInstance().getConfiguration().isForcedSingleGPU() && getNumberOfDevices() > 0) { // simple round-robin here synchronized (this) { device = CudaEnvironment.getInstance().getConfiguration().getAvailableDevices().get(devPtr.getAndIncrement()); // We check only for number of entries here, not their actual values if (devPtr.get() >= CudaEnvironment.getInstance().getConfiguration().getAvailableDevices().size()) devPtr.set(0); logger.debug("Mapping thread [{}] to device [{}], out of [{}] devices...", threadId, device, CudaEnvironment.getInstance().getConfiguration().getAvailableDevices().size()); } } else { device = CudaEnvironment.getInstance().getConfiguration().getAvailableDevices().get(0); logger.debug("Single device is forced, mapping to device [{}]", device); } return device; }
extraz.set(new PointerPointer(32)); if (CudaEnvironment.getInstance().getConfiguration().isDebug()) lastOp.set(op.name()); CudaEnvironment.getInstance().getConfiguration().enableDebug(true); for (int i = 0; i < dimension.length; i++) if (dimension[i] >= op.x().rank() && dimension[i] != Integer.MAX_VALUE)
/**
 * Builds the per-device, per-lane bookkeeping structures: an event queue and
 * a lane clock for every (device, lane) pair, plus one clock per device.
 */
public AsynchronousFlowController() {
    final int lanes = configuration.getCommandLanesNumber();
    final int devices = Nd4j.getAffinityManager().getNumberOfDevices();

    for (int device = 0; device < devices; device++) {
        eventsBarrier.add(device, new ArrayList<Queue<cudaEvent_t>>());
        laneClocks.add(device, new ArrayList<AtomicLong>());
        deviceClocks.add(device, new AtomicLong(0));

        for (int lane = 0; lane < lanes; lane++) {
            eventsBarrier.get(device).add(lane, new ConcurrentLinkedQueue<cudaEvent_t>());
            laneClocks.get(device).add(lane, new AtomicLong(0));
        }
    }
}
/**
 * Enables or disables cross-device (peer-to-peer) memory access by
 * delegating to the global CUDA configuration.
 *
 * @param reallyAllow {@code true} to allow cross-device access
 */
@Override
public void allowCrossDeviceAccess(boolean reallyAllow) {
    CudaEnvironment.getInstance().getConfiguration().allowCrossDeviceAccess(reallyAllow);
}
}
try { int var = Integer.parseInt(env.get(MAX_BLOCK_SIZE)); setMaximumBlockSize(var); } catch (Exception e) { log.error("Can't parse {}: [{}]", MAX_BLOCK_SIZE, env.get(MAX_BLOCK_SIZE)); try { int var = Integer.parseInt(env.get(MIN_BLOCK_SIZE)); setMinimumBlockSize(var); } catch (Exception e) { log.error("Can't parse {}: [{}]", MIN_BLOCK_SIZE, env.get(MIN_BLOCK_SIZE)); try { int var = Integer.parseInt(env.get(MAX_GRID_SIZE)); setMaximumGridSize(var); } catch (Exception e) { log.error("Can't parse {}: [{}]", MAX_GRID_SIZE, env.get(MAX_GRID_SIZE)); try { boolean var = Boolean.parseBoolean(env.get(DEBUG_ENABLED)); enableDebug(var); } catch (Exception e) { log.error("Can't parse {}: [{}]", DEBUG_ENABLED, env.get(DEBUG_ENABLED)); try { boolean var = Boolean.parseBoolean(env.get(FORCE_SINGLE_GPU)); allowMultiGPU(!var); } catch (Exception e) { log.error("Can't parse {}: [{}]", FORCE_SINGLE_GPU, env.get(FORCE_SINGLE_GPU));
public void applyConfiguration() { //log.info("Applying CUDA configuration..."); CudaEnvironment.getInstance().notifyConfigurationApplied(); NativeOpsHolder.getInstance().getDeviceNativeOps().enableDebugMode(configuration.isDebug()); //configuration.enableDebug(configuration.isDebug()); NativeOpsHolder.getInstance().getDeviceNativeOps().enableVerboseMode(configuration.isVerbose()); //configuration.setVerbose(configuration.isVerbose()); NativeOpsHolder.getInstance().getDeviceNativeOps().enableP2P(configuration.isCrossDeviceAccessAllowed()); //configuration.allowCrossDeviceAccess(configuration.isCrossDeviceAccessAllowed()); NativeOpsHolder.getInstance().getDeviceNativeOps().setGridLimit(configuration.getMaximumGridSize()); //configuration.setMaximumGridSize(configuration.getMaximumGridSize()); NativeOpsHolder.getInstance().getDeviceNativeOps().setOmpNumThreads(configuration.getMaximumBlockSize()); // configuration.setMaximumBlockSize(configuration.getMaximumBlockSize()); NativeOpsHolder.getInstance().getDeviceNativeOps().setOmpMinThreads(configuration.getMinimumBlockSize()); // configuration.setMinimumBlockSize(configuration.getMinimumBlockSize()); }
/**
 * Backward pass for cuDNN dropout: applies the stored dropout mask to the
 * gradient flowing from the layer output, writing the result into the
 * input-gradient array.
 * <p>
 * NOTE(review): assumes {@code mask} was populated by a preceding forward
 * pass — confirm against the caller's lifecycle.
 *
 * @param gradAtOutput gradient w.r.t. the layer output (dy)
 * @param gradAtInput  gradient w.r.t. the layer input (dx), written in place
 */
@Override
public void backprop(INDArray gradAtOutput, INDArray gradAtInput) {
    // Describe the dy tensor for cuDNN.
    int[] dyShape = adaptForTensorDescr(ArrayUtil.toInts(gradAtOutput.shape()));
    int[] dyStride = adaptForTensorDescr(ArrayUtil.toInts(gradAtOutput.stride()));
    checkCudnn(cudnnSetTensorNdDescriptor(cudnnContext.dyTensorDesc, dataType, dyShape.length, dyShape, dyStride));

    // Describe the dx tensor for cuDNN.
    int[] dxShape = adaptForTensorDescr(ArrayUtil.toInts(gradAtInput.shape()));
    int[] dxStride = adaptForTensorDescr(ArrayUtil.toInts(gradAtInput.stride()));
    checkCudnn(cudnnSetTensorNdDescriptor(cudnnContext.dxTensorDesc, dataType, dxShape.length, dxShape, dxStride));

    // Resolve device pointers through the allocator's flow controller.
    Allocator allocator = AtomicAllocator.getInstance();
    CudaContext context = allocator.getFlowController().prepareAction(gradAtOutput, gradAtInput);
    Pointer dy = allocator.getPointer(gradAtOutput, context);
    Pointer dx = allocator.getPointer(gradAtInput, context);

    checkCudnn(cudnnDropoutBackward(cudnnContext, cudnnContext.dropoutDesc,
                    cudnnContext.dyTensorDesc, dy,
                    cudnnContext.dxTensorDesc, dx,
                    mask, mask.capacity()));

    allocator.registerAction(context, gradAtOutput, gradAtInput);

    // In debug mode, synchronize the stream so errors surface at this call site.
    if (CudaEnvironment.getInstance().getConfiguration().isDebug())
        context.syncOldStream();
}
}
public CudaZeroHandler() { configuration.setInitialized(); this.INITIAL_LOCATION = configuration.getFirstMemory(); switch (configuration.getExecutionModel()) { case OPTIMIZED: case ASYNCHRONOUS: { throw new RuntimeException("Unknown ExecutionModel: [" + configuration.getExecutionModel() + "]"); switch (configuration.getAllocationModel()) { case CACHE_ALL: this.memoryProvider = new CudaFullCachingProvider(); break; default: throw new RuntimeException("Unknown AllocationModel: [" + configuration.getAllocationModel() + "]");
/**
 * Exposes the ids of all devices the backend may use.
 *
 * @return a defensive {@link HashSet} copy of the configured device list
 */
@Override
public Set<Integer> getAvailableDevices() {
    return new HashSet<>(configuration.getAvailableDevices());
}
/** * This method returns device id available. Round-robin balancing used here. * * @param threadId this parameter can be anything, it's used for logging only. * @return */ protected Integer getNextDevice(long threadId) { Integer device = null; if (!CudaEnvironment.getInstance().getConfiguration().isForcedSingleGPU() && getNumberOfDevices() > 0) { // simple round-robin here synchronized (this) { device = CudaEnvironment.getInstance().getConfiguration().getAvailableDevices().get(devPtr.getAndIncrement()); // We check only for number of entries here, not their actual values if (devPtr.get() >= CudaEnvironment.getInstance().getConfiguration().getAvailableDevices().size()) devPtr.set(0); logger.debug("Mapping thread [{}] to device [{}], out of [{}] devices...", threadId, device, CudaEnvironment.getInstance().getConfiguration().getAvailableDevices().size()); } } else { device = CudaEnvironment.getInstance().getConfiguration().getAvailableDevices().get(0); logger.debug("Single device is forced, mapping to device [{}]", device); } return device; }
extraz.set(new PointerPointer(32)); if (CudaEnvironment.getInstance().getConfiguration().isDebug()) lastOp.set(op.opName()); CudaEnvironment.getInstance().getConfiguration().enableDebug(true); for (int i = 0; i < dimension.length; i++) if (dimension[i] >= op.x().rank() && dimension[i] != Integer.MAX_VALUE)
/**
 * Initializes per-device structures: an event-barrier queue and a lane clock
 * for each command lane on each device, plus a single clock per device.
 */
public AsynchronousFlowController() {
    final int laneCount = configuration.getCommandLanesNumber();
    final int deviceCount = Nd4j.getAffinityManager().getNumberOfDevices();

    for (int dev = 0; dev < deviceCount; dev++) {
        eventsBarrier.add(dev, new ArrayList<Queue<cudaEvent_t>>());
        laneClocks.add(dev, new ArrayList<AtomicLong>());
        deviceClocks.add(dev, new AtomicLong(0));

        for (int lane = 0; lane < laneCount; lane++) {
            eventsBarrier.get(dev).add(lane, new ConcurrentLinkedQueue<cudaEvent_t>());
            laneClocks.get(dev).add(lane, new AtomicLong(0));
        }
    }
}
/**
 * Toggles peer-to-peer access between devices; delegates straight to the
 * singleton environment's configuration.
 *
 * @param reallyAllow whether cross-device access should be permitted
 */
@Override
public void allowCrossDeviceAccess(boolean reallyAllow) {
    CudaEnvironment.getInstance().getConfiguration().allowCrossDeviceAccess(reallyAllow);
}
}