private NativeOpsHolder() { try { Properties props = Nd4jContext.getInstance().getConf(); String name = System.getProperty(Nd4j.NATIVE_OPS, props.get(Nd4j.NATIVE_OPS).toString()); Class<? extends NativeOps> nativeOpsClazz = Class.forName(name).asSubclass(NativeOps.class); deviceNativeOps = nativeOpsClazz.newInstance(); deviceNativeOps.initializeDevicesAndFunctions(); int numThreads; String numThreadsString = System.getenv("OMP_NUM_THREADS"); if (numThreadsString != null && !numThreadsString.isEmpty()) { numThreads = Integer.parseInt(numThreadsString); deviceNativeOps.setOmpNumThreads(numThreads); } else { int cores = Loader.totalCores(); int chips = Loader.totalChips(); if (chips > 0 && cores > 0) { deviceNativeOps.setOmpNumThreads(Math.max(1, cores / chips)); } else deviceNativeOps.setOmpNumThreads( deviceNativeOps.getCores(Runtime.getRuntime().availableProcessors())); } //deviceNativeOps.setOmpNumThreads(4); log.info("Number of threads used for NativeOps: {}", deviceNativeOps.ompGetMaxThreads()); } catch (Exception | Error e) { throw new RuntimeException( "ND4J is probably missing dependencies. For more information, please refer to: http://nd4j.org/getstarted.html", e); } }
for (int i = 0; i < nativeOps.getAvailableDevices(); i++) { Map<String, Object> deviceProps = new HashMap<>(); deviceProps.put(Nd4jEnvironment.CUDA_DEVICE_NAME_KEY, nativeOps.getDeviceName(devPtr)); deviceProps.put(Nd4jEnvironment.CUDA_FREE_MEMORY_KEY, nativeOps.getDeviceFreeMemory(devPtr)); deviceProps.put(Nd4jEnvironment.CUDA_TOTAL_MEMORY_KEY, nativeOps.getDeviceTotalMemory(devPtr)); deviceProps.put(Nd4jEnvironment.CUDA_DEVICE_MAJOR_VERSION_KEY, (long) nativeOps.getDeviceMajor(devPtr)); deviceProps.put(Nd4jEnvironment.CUDA_DEVICE_MINOR_VERSION_KEY, (long) nativeOps.getDeviceMinor(devPtr)); props.put(Nd4jEnvironment.CUDA_NUM_GPUS_KEY, nativeOps.getAvailableDevices()); props.put(Nd4jEnvironment.CUDA_DEVICE_INFORMATION_KEY, devicesList); props.put(Nd4jEnvironment.BLAS_VENDOR_KEY, Nd4jBlas.Vendor.CUBLAS.toString()); for (int i = 0; i < nativeOps.getAvailableDevices(); i++) { Map<String, Object> dev = devicesList.get(i); CudaPointer devPtr = new CudaPointer(i); dev.put(Nd4jEnvironment.CUDA_FREE_MEMORY_KEY, nativeOps.getDeviceFreeMemory(devPtr)); dev.put(Nd4jEnvironment.CUDA_TOTAL_MEMORY_KEY, nativeOps.getDeviceTotalMemory(devPtr));
public void applyConfiguration() { //log.info("Applying CUDA configuration..."); CudaEnvironment.getInstance().notifyConfigurationApplied(); NativeOpsHolder.getInstance().getDeviceNativeOps().enableDebugMode(configuration.isDebug()); //configuration.enableDebug(configuration.isDebug()); NativeOpsHolder.getInstance().getDeviceNativeOps().enableVerboseMode(configuration.isVerbose()); //configuration.setVerbose(configuration.isVerbose()); NativeOpsHolder.getInstance().getDeviceNativeOps().enableP2P(configuration.isCrossDeviceAccessAllowed()); //configuration.allowCrossDeviceAccess(configuration.isCrossDeviceAccessAllowed()); NativeOpsHolder.getInstance().getDeviceNativeOps().setGridLimit(configuration.getMaximumGridSize()); //configuration.setMaximumGridSize(configuration.getMaximumGridSize()); NativeOpsHolder.getInstance().getDeviceNativeOps().setOmpNumThreads(configuration.getMaximumBlockSize()); // configuration.setMaximumBlockSize(configuration.getMaximumBlockSize()); NativeOpsHolder.getInstance().getDeviceNativeOps().setOmpMinThreads(configuration.getMinimumBlockSize()); // configuration.setMinimumBlockSize(configuration.getMinimumBlockSize()); }
int availableProcessors = Runtime.getRuntime().availableProcessors(); NativeOps nativeOps = NativeOpsHolder.getInstance().getDeviceNativeOps(); int nDevices = nativeOps.getAvailableDevices(); deviceDescription[i] = "Device(" + i + ")"; } else { deviceTotalMem[i] = nativeOps.getDeviceTotalMemory(p); deviceDescription[i] = nativeOps.getDeviceName(p); if (nDevices > 1) { deviceDescription[i] = deviceDescription[i] + " (" + i + ")";
if (CudaEnvironment.getInstance().getConfiguration().isCrossDeviceAccessAllowed() && nativeOps.isP2PAvailable()) { Nd4j.getExecutioner().push(); nativeOps.accumulateDouble(extras, x, (DoublePointer) z, arrays.length, len); } else if (target.data().dataType() == DataBuffer.Type.FLOAT) { nativeOps.accumulateFloat(extras, x, (FloatPointer) z, arrays.length, len); } else { nativeOps.accumulateHalf(extras, x, (ShortPointer) z, arrays.length, len); nativeOps.accumulateDouble(extras, dataPointers, (DoublePointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len); } else if (target.data().dataType() == DataBuffer.Type.FLOAT) { nativeOps.accumulateFloat(extras, dataPointers, (FloatPointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len); } else { nativeOps.accumulateHalf(extras, dataPointers, (ShortPointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len);
if (nativeOps.isP2PAvailable() && CudaEnvironment.getInstance().getConfiguration().isCrossDeviceAccessAllowed()) { nativeOps.averageDouble(extras, x, target == null ? null : (DoublePointer) z, arrays.length, len, true); } else if (arrays[0].data().dataType() == DataBuffer.Type.FLOAT) { nativeOps.averageFloat(extras, x, target == null ? null : (FloatPointer) z, arrays.length, len, true); } else { nativeOps.averageHalf(extras, x, target == null ? null : (ShortPointer) z, arrays.length, len, true); nativeOps.averageDouble(extras, dataPointers, target == null ? null : (DoublePointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len, true); } else if (arrays[0].data().dataType() == DataBuffer.Type.FLOAT) { nativeOps.averageFloat(extras, dataPointers, target == null ? null : (FloatPointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len, true); } else { nativeOps.averageHalf(extras, dataPointers, target == null ? null : (ShortPointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len, true);
long[] gpuMaxBytes = null; NativeOps nativeOps = NativeOpsHolder.getInstance().getDeviceNativeOps(); int nDevices = nativeOps.getAvailableDevices(); if (nDevices > 0) { gpuCurrentBytes = new long[nDevices]; gpuCurrentBytes[i] = 0; } else { gpuMaxBytes[i] = nativeOps.getDeviceTotalMemory(p); gpuCurrentBytes[i] = gpuMaxBytes[i] - nativeOps.getDeviceFreeMemory(p);
public CudaGridExecutioner() { // extraz.set(new PointerPointer(10)); deviceQueues.set(new ArrayDeque<OpDescriptor>()); int numDevices = nativeOps.getAvailableDevices(); for (int x = 0; x < numDevices; x++) { aggregates.add(new ConcurrentLinkedQueue<AggregateDescriptor>()); } experimental.set(nativeOps.isExperimentalEnabled()); }
int numDevices = NativeOpsHolder.getInstance().getDeviceNativeOps().getAvailableDevices(); for (int i = 0; i < numDevices; i++) { deviceAllocations.add(new ConcurrentHashMap<Long, Long>()); if (NativeOpsHolder.getInstance().getDeviceNativeOps().getDeviceMajor(new CudaPointer(0)) < 3) { throw new ND4JIllegalStateException("CUDA backend requires compute capatibility of 3.0 and above to run.");
/**
 * Returns the number of devices available in the system.
 *
 * <p>The count is obtained from the device native ops on first use and kept in
 * {@code numberOfDevices} afterwards.
 *
 * <p>Please note: the returned value might differ from the number of devices actually in use.
 *
 * @return total number of devices
 */
@Override
public int getNumberOfDevices() {
    // Fast path: a non-negative value means the count has already been stored.
    if (numberOfDevices.get() >= 0) {
        return numberOfDevices.get();
    }

    synchronized (this) {
        // NOTE(review): this inner check uses "< 1" while the outer one uses "< 0";
        // kept as-is, so a stored 0 seen here triggers a re-query.
        if (numberOfDevices.get() < 1) {
            numberOfDevices.set(
                            NativeOpsHolder.getInstance().getDeviceNativeOps().getAvailableDevices());
        }
    }

    return numberOfDevices.get();
}
// Ask the native layer for free device memory using a CudaPointer built from -1
// (presumably meaning "the current device" - TODO confirm), then refuse when granting
// requiredMemory would leave less than DEVICE_RESERVED_SPACE free.
long freeMem = nativeOps.getDeviceFreeMemory(new CudaPointer(-1)); if (freeMem - requiredMemory < DEVICE_RESERVED_SPACE) return false;
for (int i = 0; i < nativeOps.getAvailableDevices(); i++) { Map<String, Object> deviceProps = new HashMap<>(); deviceProps.put(Nd4jEnvironment.CUDA_DEVICE_NAME_KEY, nativeOps.getDeviceName(devPtr)); deviceProps.put(Nd4jEnvironment.CUDA_FREE_MEMORY_KEY, nativeOps.getDeviceFreeMemory(devPtr)); deviceProps.put(Nd4jEnvironment.CUDA_TOTAL_MEMORY_KEY, nativeOps.getDeviceTotalMemory(devPtr)); deviceProps.put(Nd4jEnvironment.CUDA_DEVICE_MAJOR_VERSION_KEY, (long) nativeOps.getDeviceMajor(devPtr)); deviceProps.put(Nd4jEnvironment.CUDA_DEVICE_MINOR_VERSION_KEY, (long) nativeOps.getDeviceMinor(devPtr)); props.put(Nd4jEnvironment.CUDA_NUM_GPUS_KEY, nativeOps.getAvailableDevices()); props.put(Nd4jEnvironment.CUDA_DEVICE_INFORMATION_KEY, devicesList); props.put(Nd4jEnvironment.BLAS_VENDOR_KEY, (Nd4j.factory().blas()).getBlasVendor().toString()); for (int i = 0; i < nativeOps.getAvailableDevices(); i++) { Map<String, Object> dev = devicesList.get(i); CudaPointer devPtr = new CudaPointer(i); dev.put(Nd4jEnvironment.CUDA_FREE_MEMORY_KEY, nativeOps.getDeviceFreeMemory(devPtr)); dev.put(Nd4jEnvironment.CUDA_TOTAL_MEMORY_KEY, nativeOps.getDeviceTotalMemory(devPtr));
public void applyConfiguration() { //log.info("Applying CUDA configuration..."); CudaEnvironment.getInstance().notifyConfigurationApplied(); NativeOpsHolder.getInstance().getDeviceNativeOps().enableDebugMode(configuration.isDebug()); //configuration.enableDebug(configuration.isDebug()); NativeOpsHolder.getInstance().getDeviceNativeOps().enableVerboseMode(configuration.isVerbose()); //configuration.setVerbose(configuration.isVerbose()); NativeOpsHolder.getInstance().getDeviceNativeOps().enableP2P(configuration.isCrossDeviceAccessAllowed()); //configuration.allowCrossDeviceAccess(configuration.isCrossDeviceAccessAllowed()); NativeOpsHolder.getInstance().getDeviceNativeOps().setGridLimit(configuration.getMaximumGridSize()); //configuration.setMaximumGridSize(configuration.getMaximumGridSize()); NativeOpsHolder.getInstance().getDeviceNativeOps().setOmpNumThreads(configuration.getMaximumBlockSize()); // configuration.setMaximumBlockSize(configuration.getMaximumBlockSize()); NativeOpsHolder.getInstance().getDeviceNativeOps().setOmpMinThreads(configuration.getMinimumBlockSize()); // configuration.setMinimumBlockSize(configuration.getMinimumBlockSize()); }
if (CudaEnvironment.getInstance().getConfiguration().isCrossDeviceAccessAllowed() && nativeOps.isP2PAvailable()) { Nd4j.getExecutioner().push(); nativeOps.accumulateDouble(extras, x, (DoublePointer) z, arrays.length, len); } else if (target.data().dataType() == DataBuffer.Type.FLOAT) { nativeOps.accumulateFloat(extras, x, (FloatPointer) z, arrays.length, len); } else { nativeOps.accumulateHalf(extras, x, (ShortPointer) z, arrays.length, len); nativeOps.accumulateDouble(extras, dataPointers, (DoublePointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len); } else if (target.data().dataType() == DataBuffer.Type.FLOAT) { nativeOps.accumulateFloat(extras, dataPointers, (FloatPointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len); } else { nativeOps.accumulateHalf(extras, dataPointers, (ShortPointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len);
if (nativeOps.isP2PAvailable() && CudaEnvironment.getInstance().getConfiguration().isCrossDeviceAccessAllowed()) { nativeOps.averageDouble(extras, x, target == null ? null : (DoublePointer) z, arrays.length, len, true); } else if (arrays[0].data().dataType() == DataBuffer.Type.FLOAT) { nativeOps.averageFloat(extras, x, target == null ? null : (FloatPointer) z, arrays.length, len, true); } else { nativeOps.averageHalf(extras, x, target == null ? null : (ShortPointer) z, arrays.length, len, true); nativeOps.averageDouble(extras, dataPointers, target == null ? null : (DoublePointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len, true); } else if (arrays[0].data().dataType() == DataBuffer.Type.FLOAT) { nativeOps.averageFloat(extras, dataPointers, target == null ? null : (FloatPointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len, true); } else { nativeOps.averageHalf(extras, dataPointers, target == null ? null : (ShortPointer) AtomicAllocator.getInstance().getHostPointer(target), arrays.length, len, true);
public CudaGridExecutioner() { // extraz.set(new PointerPointer(10)); deviceQueues.set(new ArrayDeque<OpDescriptor>()); int numDevices = nativeOps.getAvailableDevices(); for (int x = 0; x < numDevices; x++) { aggregates.add(new ConcurrentLinkedQueue<AggregateDescriptor>()); } experimental.set(nativeOps.isExperimentalEnabled()); }
int numDevices = NativeOpsHolder.getInstance().getDeviceNativeOps().getAvailableDevices(); for (int i = 0; i < numDevices; i++) { deviceAllocations.add(new ConcurrentHashMap<Long, Long>()); if (NativeOpsHolder.getInstance().getDeviceNativeOps().getDeviceMajor(new CudaPointer(0)) < 3) { throw new ND4JIllegalStateException("CUDA backend requires compute capatibility of 3.0 and above to run.");
/**
 * Returns the number of devices available in the system.
 *
 * <p>The count is obtained from the device native ops on first use and kept in
 * {@code numberOfDevices} afterwards.
 *
 * <p>Please note: the returned value might differ from the number of devices actually in use.
 *
 * @return total number of devices
 */
@Override
public int getNumberOfDevices() {
    // Fast path: a non-negative value means the count has already been stored.
    if (numberOfDevices.get() >= 0) {
        return numberOfDevices.get();
    }

    synchronized (this) {
        // NOTE(review): this inner check uses "< 1" while the outer one uses "< 0";
        // kept as-is, so a stored 0 seen here triggers a re-query.
        if (numberOfDevices.get() < 1) {
            numberOfDevices.set(
                            NativeOpsHolder.getInstance().getDeviceNativeOps().getAvailableDevices());
        }
    }

    return numberOfDevices.get();
}
// Ask the native layer for free device memory using a CudaPointer built from -1
// (presumably meaning "the current device" - TODO confirm), then refuse when granting
// requiredMemory would leave less than DEVICE_RESERVED_SPACE free.
long freeMem = nativeOps.getDeviceFreeMemory(new CudaPointer(-1)); if (freeMem - requiredMemory < DEVICE_RESERVED_SPACE) return false;
private NativeOpsHolder() { try { Properties props = Nd4jContext.getInstance().getConf(); String name = System.getProperty(Nd4j.NATIVE_OPS, props.get(Nd4j.NATIVE_OPS).toString()); Class<? extends NativeOps> nativeOpsClazz = Class.forName(name).asSubclass(NativeOps.class); deviceNativeOps = nativeOpsClazz.newInstance(); deviceNativeOps.initializeDevicesAndFunctions(); int numThreads; String numThreadsString = System.getenv("OMP_NUM_THREADS"); if (numThreadsString != null && !numThreadsString.isEmpty()) { numThreads = Integer.parseInt(numThreadsString); deviceNativeOps.setOmpNumThreads(numThreads); } else { int cores = Loader.totalCores(); int chips = Loader.totalChips(); if (chips > 0 && cores > 0) { deviceNativeOps.setOmpNumThreads(Math.max(1, cores / chips)); } else deviceNativeOps.setOmpNumThreads( deviceNativeOps.getCores(Runtime.getRuntime().availableProcessors())); } //deviceNativeOps.setOmpNumThreads(4); log.info("Number of threads used for NativeOps: {}", deviceNativeOps.ompGetMaxThreads()); } catch (Exception | Error e) { throw new RuntimeException( "ND4J is probably missing dependencies. For more information, please refer to: http://nd4j.org/getstarted.html", e); } }