Statistics for a single environment. Statistics provide indicators for
system monitoring and performance tuning.
Each statistic has a name and a getter method in this class. For example,
the
cacheTotalBytes stat is returned by the
#getCacheTotalBytes() method. Statistics are categorized into several
groups, for example,
cacheTotalBytes is in the
Cache group. Each stat and group has a name and a description.
Viewing the statistics through
#toString() shows the stat names
and values organized by group. Viewing the stats with
#toStringVerbose() additionally shows the description of each stat and
group.
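For example, a minimal sketch of collecting and printing stats for an open
environment, assuming a variable env of type Environment:

  import com.sleepycat.je.Environment;
  import com.sleepycat.je.EnvironmentStats;
  import com.sleepycat.je.StatsConfig;

  // setClear(true) would reset counters after each call, which is
  // useful for interval-based monitoring; false accumulates them.
  StatsConfig statsConfig = new StatsConfig();
  statsConfig.setClear(false);
  EnvironmentStats stats = env.getStats(statsConfig);
  System.out.println(stats);                   // names and values by group
  System.out.println(stats.toStringVerbose()); // adds descriptions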
Statistics are periodically output in CSV format to the je.stat.csv file
(see
EnvironmentConfig#STATS_COLLECT). The column header in the .csv
file has
group:stat format, where 'group' is the group name and
'stat' is the stat name. In Oracle NoSQL DB, in addition to the .csv
file, JE stats are also output in .stat files.
Stat values may also be obtained via JMX using the JEMonitor mbean.
In Oracle NoSQL DB, JE stats are obtained via a different JMX interface in
JSON format. The JSON format uses property names of the form
group_stat where 'group' is the group name and 'stat' is the stat name.
The stat groups are listed below. Each group name links to a summary of
the statistics in the group.
The following sections describe each group of stats along with some
common strategies for using them for monitoring and performance tuning.
Group Name:
com.sleepycat.je.evictor.EvictorStatDefinition#GROUP_NAME
Description:
com.sleepycat.je.evictor.EvictorStatDefinition#GROUP_DESC
Group Name:
com.sleepycat.je.evictor.OffHeapStatDefinition#GROUP_NAME
Description:
com.sleepycat.je.evictor.OffHeapStatDefinition#GROUP_DESC
The JE cache consists of the main (in-heap) cache and an optional
off-heap cache. The vast majority of the cache is occupied by Btree nodes,
including internal nodes (INs) and leaf nodes (LNs). INs contain record keys
while each LN contains a single record's key and data.
Each IN refers to a configured maximum number of child nodes (
EnvironmentConfig#NODE_MAX_ENTRIES). The INs form a Btree of at least 2
levels. With a large data set the Btree will normally have 4 or 5 levels.
The top level is a single node, the root IN. Levels are numbered from the
bottom up, starting with level 1 for bottom level INs (BINs). Levels are
added at the top when the root IN splits.
When an off-heap cache is configured, it serves as an overflow for the
main cache. See
EnvironmentConfig#MAX_OFF_HEAP_MEMORY.
Operation performance is often directly proportional to how much of the
active data set is cached. BINs and LNs form the vast majority of the cache.
Caching of BINs and LNs has different performance impacts, and behavior
varies depending on whether an off-heap cache is configured and which
CacheMode is used.
Main cache current usage is indicated by the following stats. Note that
there is currently no stat for the number of LNs in the main cache.
#getCacheTotalBytes |
com.sleepycat.je.dbi.DbiStatDefinition#MB_TOTAL_BYTES_DESC |
#getNCachedBINs |
com.sleepycat.je.evictor.EvictorStatDefinition#CACHED_BINS_DESC |
#getNCachedBINDeltas |
com.sleepycat.je.evictor.EvictorStatDefinition#CACHED_BIN_DELTAS_DESC |
#getNCachedUpperINs |
com.sleepycat.je.evictor.EvictorStatDefinition#CACHED_UPPER_INS_DESC |
Off-heap cache current usage is indicated by:
#getOffHeapTotalBytes |
com.sleepycat.je.evictor.OffHeapStatDefinition#TOTAL_BYTES_NAME |
#getOffHeapCachedLNs |
com.sleepycat.je.evictor.OffHeapStatDefinition#CACHED_LNS_DESC |
#getOffHeapCachedBINs |
com.sleepycat.je.evictor.OffHeapStatDefinition#CACHED_BINS_DESC |
#getOffHeapCachedBINDeltas |
com.sleepycat.je.evictor.OffHeapStatDefinition#CACHED_BIN_DELTAS_DESC |
A cache miss is considered a miss only when the object is not found in
either cache. Misses often result in file I/O and are a primary indicator
of cache performance. Fetches (access requests) and misses are indicated
by:
#getNLNsFetch |
com.sleepycat.je.evictor.EvictorStatDefinition#LN_FETCH_DESC |
#getNLNsFetchMiss |
com.sleepycat.je.evictor.EvictorStatDefinition#LN_FETCH_MISS_DESC |
#getNBINsFetch |
com.sleepycat.je.evictor.EvictorStatDefinition#BIN_FETCH_DESC |
#getNBINsFetchMiss |
com.sleepycat.je.evictor.EvictorStatDefinition#BIN_FETCH_MISS_DESC |
#getNBINDeltasFetchMiss |
com.sleepycat.je.evictor.EvictorStatDefinition#BIN_DELTA_FETCH_MISS_DESC |
#getNFullBINsMiss |
com.sleepycat.je.evictor.EvictorStatDefinition#FULL_BIN_MISS_DESC |
#getNUpperINsFetch |
com.sleepycat.je.evictor.EvictorStatDefinition#UPPER_IN_FETCH_DESC |
#getNUpperINsFetchMiss |
com.sleepycat.je.evictor.EvictorStatDefinition#UPPER_IN_FETCH_MISS_DESC |
When the number of LN misses (
nLNsFetchMiss) or the number of
BIN misses (
nBINsFetchMiss + nFullBINsMiss) is significant, the
JE cache may be undersized, as discussed below. But note that it is not
practical to correlate the number of fetches and misses directly to
application operations, because LNs are sometimes embedded in BINs (see
EnvironmentConfig#TREE_MAX_EMBEDDED_LN), BINs are sometimes
accessed multiple times per operation, and internal Btree accesses are
included in the stat values.
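With those caveats, the fetch and miss counters can still be combined into
rough miss ratios for trend monitoring. A sketch, assuming stats is an
EnvironmentStats instance obtained as shown earlier:

  // Fraction of LN fetches that missed both caches.
  long lnFetch = stats.getNLNsFetch();
  long lnMiss = stats.getNLNsFetchMiss();
  double lnMissRatio = (lnFetch > 0) ? ((double) lnMiss) / lnFetch : 0.0;

  // BIN misses, counting full-BIN and BIN-delta fetch misses together.
  long binFetch = stats.getNBINsFetch();
  long binMiss = stats.getNBINsFetchMiss() + stats.getNFullBINsMiss();
  double binMissRatio = (binFetch > 0) ? ((double) binMiss) / binFetch : 0.0;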
Ideally, all BINs and LNs for the active data set should fit in cache so
that operations do not result in fetch misses, which often perform random
read I/O. When this is not practical, which is often the case for large
data sets, the next best thing is to ensure that all BINs fit in cache,
so that an operation will perform at most one random read I/O to fetch
the LN. The
DbCacheSize javadoc describes how to size the cache
to ensure that all BINs and/or LNs fit in cache.
Normally
EnvironmentConfig#MAX_MEMORY_PERCENT determines the JE
cache size as a value relative to the JVM heap size, i.e., the heap size
determines the cache size.
For configuring cache size and behavior, see:
-
EnvironmentConfig#MAX_MEMORY_PERCENT
-
EnvironmentConfig#MAX_MEMORY
-
EnvironmentConfig#MAX_OFF_HEAP_MEMORY
-
EnvironmentConfig#setCacheMode(CacheMode)
-
CacheMode
-
DbCacheSize
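A sketch of setting these cache parameters before opening the environment;
the sizes shown are illustrative, not recommendations:

  import com.sleepycat.je.CacheMode;
  import com.sleepycat.je.EnvironmentConfig;

  EnvironmentConfig envConfig = new EnvironmentConfig();
  envConfig.setAllowCreate(true);
  // Main cache: 60% of the JVM heap (instead of the default percent).
  envConfig.setCachePercent(60);
  // Off-heap cache: 1 GB, set via the documented param name.
  envConfig.setConfigParam(
      EnvironmentConfig.MAX_OFF_HEAP_MEMORY,
      String.valueOf(1024L * 1024 * 1024));
  // Evict LNs from the main cache after each operation to reduce GC
  // pressure.
  envConfig.setCacheMode(CacheMode.EVICT_LN);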
When using Oracle NoSQL DB, a sizing exercise and
DbCacheSize are
used to determine the cache size needed to hold all BINs in memory. The
memory available to each node is divided between a 32 GB heap for the JVM
process (so that CompressedOops may be used) and the off-heap cache (when
more than 32 GB of memory is available).
It is also important not to configure the cache size too large, relative
to the JVM heap size. If there is not enough free space in the heap, Java
GC pauses may become a problem. Increasing the default value for
MAX_MEMORY_PERCENT, or setting
MAX_MEMORY (which overrides
MAX_MEMORY_PERCENT), should be done carefully.
Java GC performance may also be improved by using
CacheMode#EVICT_LN. Record data sizes should also be kept below 1 MB to
avoid "humongous objects" (see Java GC documentation).
When using Oracle NoSQL DB, by default,
MAX_MEMORY_PERCENT is
set to 70% and
CacheMode#EVICT_LN is used. The LOB (large object)
API is implemented using multiple JE records per LOB where the data size of
each record is 1 MB or less.
When a shared cache is configured, the main and off-heap cache may be
shared by multiple JE Environments in a single JVM process. See:
-
EnvironmentConfig#SHARED_CACHE
-
#getSharedCacheTotalBytes()
-
#getNSharedCacheEnvironments()
When using Oracle NoSQL DB, the JE shared cache feature is not used
because each node only uses a single JE Environment.
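A sketch of enabling the shared cache when opening multiple environments in
one process; the directory paths are hypothetical:

  import java.io.File;
  import com.sleepycat.je.Environment;
  import com.sleepycat.je.EnvironmentConfig;

  EnvironmentConfig envConfig = new EnvironmentConfig();
  envConfig.setAllowCreate(true);
  envConfig.setSharedCache(true); // both environments share one cache
  Environment env1 = new Environment(new File("/data/env1"), envConfig);
  Environment env2 = new Environment(new File("/data/env2"), envConfig);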
Since a large portion of an IN consists of record keys, JE uses key
prefixing (see DatabaseConfig#setKeyPrefixing(boolean)) to reduce the
amount of cache memory occupied by keys.
Ideally, key suffixes are small enough to be stored using the compact
key format (see EnvironmentConfig#TREE_COMPACT_MAX_KEY_LENGTH). The
following stat indicates the number of INs using this compact format:
#getNINCompactKeyIN |
com.sleepycat.je.evictor.EvictorStatDefinition#CACHED_IN_COMPACT_KEY_DESC |
Configuration params impacting key prefixing and the compact key format
are:
-
DatabaseConfig#setKeyPrefixing(boolean)
-
EnvironmentConfig#TREE_COMPACT_MAX_KEY_LENGTH
Enabling key prefixing for all databases is strongly recommended. When
using Oracle NoSQL DB, key prefixing is always enabled.
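A sketch of enabling key prefixing when opening a database; the database
name is hypothetical and env is assumed to be an open Environment:

  import com.sleepycat.je.Database;
  import com.sleepycat.je.DatabaseConfig;

  DatabaseConfig dbConfig = new DatabaseConfig();
  dbConfig.setAllowCreate(true);
  dbConfig.setKeyPrefixing(true); // store shared key prefixes once per IN
  Database db = env.openDatabase(null, "myDb", dbConfig);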
Another configuration param impacting BIN cache size is
TREE_MAX_EMBEDDED_LN. There is currently no stat indicating the number of
embedded LNs. See:
-
EnvironmentConfig#TREE_MAX_EMBEDDED_LN
Although the Btree normally occupies the vast majority of the cache, it
is possible that record locks occupy unexpected amounts of cache when
large transactions are used, or when cursors or transactions are left open
due to application bugs. The following stat indicates the amount of cache
used by record locks:
#getLockBytes() |
com.sleepycat.je.dbi.DbiStatDefinition#MB_LOCK_BYTES_DESC |
To reduce the amount of memory used for record locks:
- Use a small number of write operations per transaction. Write
locks are held until the end of a transaction.
- For transactions using Serializable isolation or RepeatableRead
isolation (the default), use a small number of read operations per
transaction.
- To read large numbers of records, use
LockMode#READ_COMMITTED isolation or use a null Transaction (which
implies ReadCommitted). With ReadCommitted isolation, locks are
released after each read operation. Using
LockMode#READ_UNCOMMITTED will also avoid record locks, but does not
provide any transactional guarantees.
- Ensure that all cursors and transactions are closed
promptly.
Note that the above guidelines are also important for reducing contention
when records are accessed concurrently from multiple threads and
transactions. When using Oracle NoSQL DB, the application should avoid
performing a large number of write operations in a single request. For read
operations, NoSQL DB uses ReadCommitted isolation to avoid accumulation of
locks.
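For example, a sketch of scanning many records without accumulating read
locks, assuming db is an open transactional Database:

  import com.sleepycat.je.Cursor;
  import com.sleepycat.je.CursorConfig;
  import com.sleepycat.je.DatabaseEntry;
  import com.sleepycat.je.OperationStatus;

  DatabaseEntry key = new DatabaseEntry();
  DatabaseEntry data = new DatabaseEntry();
  // ReadCommitted: each record's read lock is released after the read.
  Cursor cursor = db.openCursor(null, CursorConfig.READ_COMMITTED);
  try {
      while (cursor.getNext(key, data, null) == OperationStatus.SUCCESS) {
          // process one record ...
      }
  } finally {
      cursor.close();
  }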
Another unexpected use of cache is possible when using a
DiskOrderedCursor or when calling
Database#count(). The amount of
cache used by these operations is indicated by:
#getDOSBytes |
com.sleepycat.je.dbi.DbiStatDefinition#MB_DOS_BYTES_DESC |
DiskOrderedCursor and
Database.count should normally be
explicitly constrained to use a maximum amount of cache memory. See:
-
DiskOrderedCursorConfig#setInternalMemoryLimit(long)
-
Database#count(long)
Oracle NoSQL DB does not currently use
DiskOrderedCursor or
Database.count.
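A sketch of constraining the cache memory used by these operations; the
10 MB limit is illustrative and db is assumed to be an open Database:

  import com.sleepycat.je.DatabaseEntry;
  import com.sleepycat.je.DiskOrderedCursor;
  import com.sleepycat.je.DiskOrderedCursorConfig;
  import com.sleepycat.je.LockMode;
  import com.sleepycat.je.OperationStatus;

  DiskOrderedCursorConfig docConfig = new DiskOrderedCursorConfig();
  docConfig.setInternalMemoryLimit(10L * 1024 * 1024); // cap cache usage
  DiskOrderedCursor doc = db.openCursor(docConfig);
  try {
      DatabaseEntry key = new DatabaseEntry();
      DatabaseEntry data = new DatabaseEntry();
      while (doc.getNext(key, data, LockMode.READ_UNCOMMITTED)
              == OperationStatus.SUCCESS) {
          // process one record ...
      }
  } finally {
      doc.close();
  }

  // Database.count with an explicit memory limit:
  long count = db.count(10L * 1024 * 1024);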
Eviction is removal of Btree nodes from the cache in order to make room
for newly added nodes. See
CacheMode for a description of
eviction.
Normally eviction is performed via background threads in the eviction
thread pools. Disabling the eviction pool threads is not recommended.
-
EnvironmentConfig#ENV_RUN_EVICTOR
-
EnvironmentConfig#ENV_RUN_OFFHEAP_EVICTOR
Eviction stats are important indicators of cache efficiency and provide a
deeper understanding of cache behavior. Main cache eviction is indicated
by:
#getNLNsEvicted |
com.sleepycat.je.evictor.EvictorStatDefinition#EVICTOR_LNS_EVICTED_DESC |
#getNNodesMutated |
com.sleepycat.je.evictor.EvictorStatDefinition#EVICTOR_NODES_MUTATED_DESC |
#getNNodesEvicted |
com.sleepycat.je.evictor.EvictorStatDefinition#EVICTOR_NODES_EVICTED_DESC |
#getNDirtyNodesEvicted |
com.sleepycat.je.evictor.EvictorStatDefinition#EVICTOR_DIRTY_NODES_EVICTED_DESC |
Note that objects evicted from the main cache are moved to the off-heap
cache whenever possible.
Off-heap cache eviction is indicated by:
#getOffHeapLNsEvicted |
com.sleepycat.je.evictor.OffHeapStatDefinition#LNS_EVICTED_DESC |
#getOffHeapNodesMutated |
com.sleepycat.je.evictor.OffHeapStatDefinition#NODES_MUTATED_DESC |
#getOffHeapNodesEvicted |
com.sleepycat.je.evictor.OffHeapStatDefinition#NODES_EVICTED_DESC |
#getOffHeapDirtyNodesEvicted |
com.sleepycat.je.evictor.OffHeapStatDefinition#DIRTY_NODES_EVICTED_DESC |
When analyzing Java GC performance, the most relevant stats are
NLNsEvicted,
NNodesMutated and
NNodesEvicted, which all
indicate eviction from the main cache based on LRU. Large values for these
stats indicate that many old generation Java objects are being GC'd, which
is often a cause of GC pauses.
Note that when
CacheMode#EVICT_LN is used or when LNs are embedded (see
EnvironmentConfig#TREE_MAX_EMBEDDED_LN),
NLNsEvicted will
be close to zero because LNs are not evicted based on LRU. And if an
off-heap cache is configured,
NNodesMutated will be close to zero
because BIN mutation takes place in the off-heap cache. If any of the three
values are large, this points to a potential GC performance problem. The GC
logs should be consulted to confirm this.
Large values for
NDirtyNodesEvicted or
OffHeapDirtyNodesEvicted indicate that the cache is severely undersized,
risking the use of all available disk space as well as severe performance
problems. Dirty nodes are evicted last (after evicting all non-dirty nodes)
because they must be written to disk. This causes excessive writing, and JE
log cleaning may become unproductive.
Note that when an off-heap cache is configured,
NDirtyNodesEvicted will be zero because dirty nodes in the main cache are
moved to the off-heap cache if they don't fit in the main cache, and are
evicted completely and written to disk only when they don't fit in the
off-heap cache.
Another type of eviction tuning for the main cache involves changing the
number of bytes evicted each time an evictor thread is awoken:
-
EnvironmentConfig#EVICTOR_EVICT_BYTES
If the number of bytes is too large, it may cause a noticeable spike in
eviction activity, reducing resources available to other threads. If the
number of bytes is too small, the overhead of waking the evictor threads
more often may be noticeable. The default value for this parameter is
generally a good compromise. This parameter also impacts critical eviction,
which is described next.
Note that the corresponding parameter for the off-heap cache,
EnvironmentConfig#OFFHEAP_EVICT_BYTES, works differently and is described
in the next section.
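A sketch of adjusting the eviction batch size; the value is illustrative,
not a recommendation:

  import com.sleepycat.je.EnvironmentConfig;

  EnvironmentConfig envConfig = new EnvironmentConfig();
  // Evict roughly 1 MB each time an evictor thread is awoken.
  envConfig.setConfigParam(
      EnvironmentConfig.EVICTOR_EVICT_BYTES, String.valueOf(1L << 20));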
The following stats indicate that critical eviction is occurring:
#getNBytesEvictedCritical |
com.sleepycat.je.evictor.EvictorStatDefinition#N_BYTES_EVICTED_CRITICAL_DESC |
#getNBytesEvictedCacheMode |
com.sleepycat.je.evictor.EvictorStatDefinition#N_BYTES_EVICTED_CACHEMODE_DESC |
#getNBytesEvictedDeamon |
com.sleepycat.je.evictor.EvictorStatDefinition#N_BYTES_EVICTED_DAEMON_DESC |
#getNBytesEvictedEvictorThread |
com.sleepycat.je.evictor.EvictorStatDefinition#N_BYTES_EVICTED_EVICTORTHREAD_DESC |
#getNBytesEvictedManual |
com.sleepycat.je.evictor.EvictorStatDefinition#N_BYTES_EVICTED_MANUAL_DESC |
#getOffHeapCriticalNodesTargeted |
com.sleepycat.je.evictor.OffHeapStatDefinition#CRITICAL_NODES_TARGETED_DESC |
#getOffHeapNodesTargeted |
com.sleepycat.je.evictor.OffHeapStatDefinition#NODES_TARGETED_DESC |
Eviction is performed by eviction pool threads, calls to
Environment#evictMemory() in application background threads, or via
CacheMode#EVICT_LN or
CacheMode#EVICT_BIN. If these mechanisms are
not sufficient to evict memory from cache as quickly as CRUD operations are
adding memory to cache, then critical eviction comes into play. Critical
eviction is performed in-line in the thread performing the CRUD operation,
which is very undesirable since it increases operation latency.
Critical eviction in the main cache is indicated by large values for
NBytesEvictedCritical, as compared to the other
NBytesEvictedXXX stats. Critical eviction in the off-heap cache is
indicated by large values for
OffHeapCriticalNodesTargeted compared
to
OffHeapNodesTargeted.
Additional stats indicating that background eviction threads may be
insufficient are:
#getNThreadUnavailable |
com.sleepycat.je.evictor.EvictorStatDefinition#THREAD_UNAVAILABLE_DESC |
#getOffHeapThreadUnavailable |
com.sleepycat.je.evictor.OffHeapStatDefinition#THREAD_UNAVAILABLE_DESC |
Critical eviction can sometimes be reduced by changing
EnvironmentConfig#EVICTOR_CRITICAL_PERCENTAGE or modifying the eviction
thread pool parameters.
-
EnvironmentConfig#EVICTOR_CRITICAL_PERCENTAGE
-
EnvironmentConfig#EVICTOR_CORE_THREADS
-
EnvironmentConfig#EVICTOR_MAX_THREADS
-
EnvironmentConfig#EVICTOR_KEEP_ALIVE
-
EnvironmentConfig#OFFHEAP_CORE_THREADS
-
EnvironmentConfig#OFFHEAP_MAX_THREADS
-
EnvironmentConfig#OFFHEAP_KEEP_ALIVE
When using Oracle NoSQL DB,
EVICTOR_CRITICAL_PERCENTAGE is set to
20% rather than using the JE default of 0%.
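A sketch of adjusting these settings via setConfigParam; the values are
illustrative:

  import com.sleepycat.je.EnvironmentConfig;

  EnvironmentConfig envConfig = new EnvironmentConfig();
  // Let cache usage exceed its budget by up to 20% before CRUD threads
  // must perform critical eviction in-line (the NoSQL DB setting).
  envConfig.setConfigParam(
      EnvironmentConfig.EVICTOR_CRITICAL_PERCENTAGE, "20");
  // Enlarge the main cache eviction thread pool.
  envConfig.setConfigParam(EnvironmentConfig.EVICTOR_CORE_THREADS, "2");
  envConfig.setConfigParam(EnvironmentConfig.EVICTOR_MAX_THREADS, "4");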
In the main cache, critical eviction uses the same parameter as
background eviction for determining how many bytes to evict at one
time:
-
EnvironmentConfig#EVICTOR_EVICT_BYTES
Be careful when increasing this value, since this will cause longer
operation latencies when critical eviction is occurring in the main
cache.
The corresponding parameter for the off-heap cache,
OFFHEAP_EVICT_BYTES, works differently:
-
EnvironmentConfig#OFFHEAP_EVICT_BYTES
Unlike in the main cache,
OFFHEAP_EVICT_BYTES defines the goal
for background eviction to be below
MAX_OFF_HEAP_MEMORY. The
background evictor threads for the off-heap cache attempt to maintain the
size of the off-heap cache at
MAX_OFF_HEAP_MEMORY - OFFHEAP_EVICT_BYTES. If the off-heap cache size grows larger than
MAX_OFF_HEAP_MEMORY, critical off-heap eviction will occur. The default
value for
OFFHEAP_EVICT_BYTES is fairly large to ensure that
critical eviction does not occur. Be careful when lowering this value.
This approach is intended to prevent the off-heap cache from exceeding
its maximum size. If the maximum is exceeded, there is a danger that the
JVM process will be killed by the OS. See
#getOffHeapAllocFailures().
Another common tuning issue involves thread contention on the cache LRU
lists, although there is no stat to indicate such contention. Since each
time a node is accessed it must be moved to the end of the LRU list, a
single LRU list would cause contention among threads performing CRUD
operations. By default there are 4 LRU lists for each cache. If contention
is noticeable on internal Evictor.LRUList or OffHeapCache.LRUList methods,
consider increasing the number of LRU lists:
-
EnvironmentConfig#EVICTOR_N_LRU_LISTS
-
EnvironmentConfig#OFFHEAP_N_LRU_LISTS
However, note that increasing the number of LRU lists will decrease the
accuracy of the LRU.
The following cache stats are unlikely to be needed for monitoring or
tuning, but are sometimes useful for debugging and testing.
#getDataBytes |
com.sleepycat.je.dbi.DbiStatDefinition#MB_DATA_BYTES_DESC |
#getAdminBytes |
com.sleepycat.je.dbi.DbiStatDefinition#MB_ADMIN_BYTES_DESC |
#getNNodesTargeted |
com.sleepycat.je.evictor.EvictorStatDefinition#EVICTOR_NODES_TARGETED_DESC |
#getNNodesStripped |
com.sleepycat.je.evictor.EvictorStatDefinition#EVICTOR_NODES_STRIPPED_DESC |
#getNNodesPutBack |
com.sleepycat.je.evictor.EvictorStatDefinition#EVICTOR_NODES_PUT_BACK_DESC |
#getNNodesMovedToDirtyLRU |
com.sleepycat.je.evictor.EvictorStatDefinition#EVICTOR_NODES_MOVED_TO_PRI2_LRU_DESC |
#getNNodesSkipped |
com.sleepycat.je.evictor.EvictorStatDefinition#EVICTOR_NODES_SKIPPED_DESC |
#getNRootNodesEvicted |
com.sleepycat.je.evictor.EvictorStatDefinition#EVICTOR_ROOT_NODES_EVICTED_DESC |
#getNBINsFetchMissRatio |
com.sleepycat.je.evictor.EvictorStatDefinition#BIN_FETCH_MISS_RATIO_DESC |
#getNINSparseTarget |
com.sleepycat.je.evictor.EvictorStatDefinition#CACHED_IN_SPARSE_TARGET_DESC |
#getNINNoTarget |
com.sleepycat.je.evictor.EvictorStatDefinition#CACHED_IN_NO_TARGET_DESC |
#getMixedLRUSize |
com.sleepycat.je.evictor.EvictorStatDefinition#PRI1_LRU_SIZE_DESC |
#getDirtyLRUSize |
com.sleepycat.je.evictor.EvictorStatDefinition#PRI2_LRU_SIZE_DESC |
#getOffHeapAllocFailures |
com.sleepycat.je.evictor.OffHeapStatDefinition#ALLOC_FAILURE_DESC |
#getOffHeapAllocOverflows |
com.sleepycat.je.evictor.OffHeapStatDefinition#ALLOC_OVERFLOW_DESC |
#getOffHeapNodesStripped |
com.sleepycat.je.evictor.OffHeapStatDefinition#NODES_STRIPPED_DESC |
#getOffHeapNodesSkipped |
com.sleepycat.je.evictor.OffHeapStatDefinition#NODES_SKIPPED_DESC |
#getOffHeapLNsLoaded |
com.sleepycat.je.evictor.OffHeapStatDefinition#LNS_LOADED_DESC |
#getOffHeapLNsStored |
com.sleepycat.je.evictor.OffHeapStatDefinition#LNS_STORED_DESC |
#getOffHeapBINsLoaded |
com.sleepycat.je.evictor.OffHeapStatDefinition#BINS_LOADED_DESC |
#getOffHeapBINsStored |
com.sleepycat.je.evictor.OffHeapStatDefinition#BINS_STORED_DESC |
#getOffHeapTotalBlocks |
com.sleepycat.je.evictor.OffHeapStatDefinition#TOTAL_BLOCKS_DESC |
#getOffHeapLRUSize |
com.sleepycat.je.evictor.OffHeapStatDefinition#LRU_SIZE_DESC |
Likewise, the following cache configuration params are unlikely to be
needed for tuning, but are sometimes useful for debugging and testing.
-
EnvironmentConfig#ENV_DB_EVICTION
-
EnvironmentConfig#TREE_MIN_MEMORY
-
EnvironmentConfig#EVICTOR_FORCED_YIELD
-
EnvironmentConfig#EVICTOR_ALLOW_BIN_DELTAS
-
EnvironmentConfig#OFFHEAP_CHECKSUM
Group Name:
com.sleepycat.je.cleaner.CleanerStatDefinition#GROUP_NAME
Description:
com.sleepycat.je.cleaner.CleanerStatDefinition#GROUP_DESC
The JE cleaner is responsible for "disk garbage collection" within
JE's log structured (append only) storage system. Data files (.jdb files),
which are also called log files, are cleaned and deleted as their
contents become obsolete. See this
introduction to JE data files.
By utilization we mean the ratio of utilized size to the total size of
the active data files. The cleaner is run
when overall utilization (for all active files) drops below the target
utilization, which is specified by
EnvironmentConfig#CLEANER_MIN_UTILIZATION. The cleaner attempts to
maintain overall utilization at the target level. In addition, a file
will be cleaned if its individual utilization drops below
EnvironmentConfig#CLEANER_MIN_FILE_UTILIZATION, irrespective of overall
utilization.
Current (actual) utilization is indicated by the following stats.
#getCurrentMinUtilization |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_MIN_UTILIZATION_DESC |
#getCurrentMaxUtilization |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_MAX_UTILIZATION_DESC |
If TTL is not used, the minimum and maximum utilization will be the
same. If TTL is used, the minimum and maximum define a range that bounds
the actual utilization. The current utilization is not known precisely when
TTL is used because of the potential overlap between expired data and
data that has become obsolete due to updates or deletions. See
Cleaning Statistics: TTL and expired data.
If the cleaner is successfully maintaining the target utilization, the
current utilization (indicated by the above stats) is normally slightly
lower than the target utilization. This is because the cleaner is not
activated until current utilization drops below the target utilization and
it takes time for the cleaner to free space and raise the current
utilization. If the current utilization is significantly lower than the
target utilization (e.g., more than five percentage points lower), this
typically means the cleaner is unable to maintain the target utilization.
(When the minimum and maximum utilization stats are unequal, we recommend
using the maximum utilization for this determination.)
When the cleaner is unable to maintain the target utilization, it will
clean files continuously in an attempt to reach the target. This will use
significant system resources in the best case and will use all available
disk space in the worst case, so the source of the problem should be
identified and corrected using the guidelines below.
- One possibility is that the cleaner is unable to keep up simply
because there are many more application threads generating waste than
there are cleaner threads. To rule this out, try increasing the
number of
EnvironmentConfig#CLEANER_THREADS.
For example, the NoSQL DB product uses two cleaner threads.
- The cleaner may be able to keep up with generated waste, but due to
cleaning efficiency factors (explained in the next section) it may
not be able to maintain the configured target utilization, or it may be
consuming large amounts of resources in order to do so. In this case,
configuring a lower
EnvironmentConfig#CLEANER_MIN_UTILIZATION is one solution. For example, the NoSQL DB product
uses a target utilization of 40%. See the next section for additional
guidelines.
- In extreme cases, cleaning efficiency factors make it impossible for
the cleaner to make forward progress, meaning that more obsolete space
is generated by cleaning than can be reclaimed. This will eventually
result in using all available disk space. To avoid this, follow the
guidelines above and in the next section.
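The cleaner parameters mentioned in the first two items can be set as in
the following sketch; the values match the NoSQL DB examples above:

  import com.sleepycat.je.EnvironmentConfig;

  EnvironmentConfig envConfig = new EnvironmentConfig();
  // More cleaner threads help when many threads generate waste.
  envConfig.setConfigParam(EnvironmentConfig.CLEANER_THREADS, "2");
  // A lower target utilization makes cleaning cheaper to sustain.
  envConfig.setConfigParam(EnvironmentConfig.CLEANER_MIN_UTILIZATION, "40");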
The general guidelines for ensuring that the cleaner can maintain
the target utilization are:
- ensure that the JE cache is sized
appropriately, and
- avoid large record keys, especially when the data size is small
yet too large to be embedded in the BIN (as discussed below).
The remainder of this section is intended to help understand the
reasons for these recommendations and to aid in advanced tuning.
A JE data file consists mainly of Btree nodes, which include internal
nodes (INs) and leaf nodes (LNs). Each IN contains the keys of roughly
EnvironmentConfig#NODE_MAX_ENTRIES records, while an LN
contains the key and data of a single record. When the cleaner processes
a data file it migrates (copies) active LNs to the end of the log, and
dirties their parent BINs (bottom internal nodes). Active INs are dirtied
but not immediately copied. The next checkpoint will then write the INs
dirtied as a result of cleaning the file:
- The BIN parents of the active LNs from the cleaned file.
- The active INs from the cleaned file, and their parent INs.
Finally, now that the persistent form of the Btree contains no
references to the cleaned file, the file can be deleted. (In HA
environments the file is not deleted immediately as
will be discussed.)
When LNs are migrated, logging of their dirtied parent BINs causes the
previous version of these BINs to become obsolete. In many cases the
previous version may be in a different file than the cleaned file. So
although the cleaner reclaims the space for the obsolete data in the
cleaned file, it also creates some amount of additional obsolete space.
The ratio of reclaimed space to additional obsolete space determines the
maximum utilization that can result from cleaning. If this maximum is less
than the target utilization, the cleaner will run continuously in an attempt
to reach the target and will consume large amounts of system resources.
Several factors influence how much additional obsolete space is created:
- The lower the utilization of the file selected for cleaning, the
fewer active LNs are migrated. This means fewer parent BINs are dirtied
and logged, and therefore less obsolete space is created. For this
reason, specifying a lower target utilization will cause cleaning to be
less expensive. Also, this is why JE always selects files with the
lowest utilization for cleaning. Some application workloads vary over
time and create a mix of high and low utilization files, while others
are consistent over time and all files have the same utilization;
cleaning will be more efficient for workloads of the first type.
- A special case is when a record's data is stored (embedded) in the
BIN. This is referred to as an embedded LN. Embedded LNs are
not migrated by the cleaner (they are no longer needed after
transaction processing), so embedded LNs do not cause the creation of
additional obsolete space during cleaning. LNs are embedded in two
situations:
- All LNs are embedded in a database configured for sorted duplicates
(see DatabaseConfig#setSortedDuplicates). Such
databases are normally
SecondaryDatabases.
- In a DB where duplicate keys are not allowed (which is the
default for a
Database), LNs are embedded when the data
size is no larger than
EnvironmentConfig#TREE_MAX_EMBEDDED_LN. Such databases
are normally primary databases, but in rare cases can be
SecondaryDatabases.
- When non-embedded LNs have a relatively large data size, fewer
LNs per file are migrated and therefore less obsolete space is
created. On the other hand, when the data size is small, yet too
large to be embedded (see
EnvironmentConfig#TREE_MAX_EMBEDDED_LN),
significant amounts of obsolete space may be created by cleaning. This
is because many LNs are migrated per file, and for each of these LNs a
BIN is dirtied.
EnvironmentConfig#TREE_MAX_EMBEDDED_LN can be
increased to solve this problem in some cases, but (as described in its
javadoc) increasing this value will increase BIN cache usage and
should be done with caution.
- When LNs have a relatively large key size, their parent BINs are
also larger. When these LNs are not embedded, the larger BIN size means
that more obsolete space is created by cleaning. Even when the LNs
are embedded, normal write operations will create more
obsolete space and BIN cache usage will be increased.
- For the reasons stated above, a worst case for creation of obsolete
space during cleaning is when LNs have large keys and small data, yet
not small enough to be embedded.
- The larger the
EnvironmentConfig#CHECKPOINTER_BYTES_INTERVAL, the more likely it is that migration of two or
more LNs will dirty a single parent BIN (assuming the absence of cache
eviction). This causes fewer BINs to be logged as a result of migration,
so less obsolete space is created. In other words, increasing the
checkpoint interval increases write absorption. This is true for
ordinary record write operations as well as LN migration.
Even when cleaning does not create significant amounts of additional
obsolete space, an undersized cache can still
prevent the cleaner from maintaining the target utilization when eviction
of dirty BINs occurs. When eviction causes logging of dirty BINs, this
reduces or even cancels out the write absorption benefits that normally
occur due to periodic checkpoints. In the worst case, every record write
operation causes a BIN to be written as well, which means that large
amounts of obsolete data will be created at a high rate. The
#getNDirtyNodesEvicted() and
#getOffHeapDirtyNodesEvicted() cache statistics can help to identify this problem.
Even when cleaning is maintaining the target utilization, it may
consume large amounts of system resources in order to do so. The
following indicators of cleaning activity can be used to get a rough idea
of the level of cleaning activity.
#getNCleanerRuns |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_RUNS_DESC |
#getNCleanerEntriesRead |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_ENTRIES_READ_DESC |
As mentioned earlier, configuring a lower
EnvironmentConfig#CLEANER_MIN_UTILIZATION is one
way to reduce cleaner resource consumption.
The read IO caused by cleaning is indicated by the following stat:
#getNCleanerDiskRead |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_DISK_READS_DESC |
The impact of cleaner read IO can sometimes be reduced by increasing the
EnvironmentConfig#CLEANER_READ_SIZE.
The write IO caused by cleaning is due to LN migration (see
#getNLNsMigrated()) and to logging of INs that were dirtied by the
cleaner. Both of these costs can be reduced by decreasing
EnvironmentConfig#CLEANER_MIN_UTILIZATION.
Logging of dirty INs can also be reduced by using smaller key sizes,
especially when the data size is small, yet too large to be embedded in
the BIN.
When a workload involves inserting and deleting large numbers of
records, another way of increasing cleaner efficiency is to remove the
records using
WriteOptions#setTTL(int) or
ExtinctionFilter, rather than performing transactional
record deletions. When records have expired or become extinct, the cleaner
can discard the LNs without a Btree lookup as described in the next
section. Also, because there are no transactional deletions there is less
cleaner metadata and less writing overall.
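For example, a sketch of writing a record with a TTL so that the cleaner
can later discard its LN without a Btree lookup; the key, data, and 30-day
TTL are illustrative, and db is assumed to be an open Database:

  import java.nio.charset.StandardCharsets;
  import com.sleepycat.je.DatabaseEntry;
  import com.sleepycat.je.Put;
  import com.sleepycat.je.WriteOptions;

  DatabaseEntry key =
      new DatabaseEntry("myKey".getBytes(StandardCharsets.UTF_8));
  DatabaseEntry data =
      new DatabaseEntry("myData".getBytes(StandardCharsets.UTF_8));
  WriteOptions options = new WriteOptions();
  options.setTTL(30); // record expires 30 days after the write
  db.put(null, key, data, Put.OVERWRITE, options);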
This section describes details of cleaner file processing. The stats in
this section are useful for internal analysis and debugging.
When the cleaner processes a data file, it reads the Btree entries: LNs,
BIN-deltas, BINs and upper INs. The number of entries processed is
#getNCleanerEntriesRead(). (There are a small number of additional
non-Btree entries in each file that are always obsolete and are completely
ignored by the cleaner.)
The first step of processing a Btree entry is to determine if it is
known-obsolete. A known-obsolete entry is one of the following:
- A Btree entry that was recorded as obsolete in the cleaner's
per-file metadata during transaction processing.
- A Btree entry that belongs to a Database that has been removed or
truncated (see Environment#removeDatabase and
Environment#truncateDatabase). Note that DBs are
added to a pending DB queue if the removal or truncation is not yet
complete; this is discussed in the next section.
- An LN entry representing a record deletion in the transaction
log.
- An LN that has expired (see
#getNLNsExpired()). Expired LNs
result from the use of
WriteOptions#setTTL(int).
- An LN that has become extinct (see
#getNLNsExtinct()). Extinct
LNs result from using an
ExtinctionFilter along with the
Environment#discardExtinctRecords method.
Known-obsolete entries are very inexpensive to process because no
Btree lookup is required to determine that they are obsolete, and they
can simply be discarded. The number of known-obsolete entries is the sum
of the following
XxxObsolete stats:
#getNLNsObsolete |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_LNS_OBSOLETE_DESC |
#getNINsObsolete |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_INS_OBSOLETE_DESC |
#getNBINDeltasObsolete |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_BIN_DELTAS_OBSOLETE_DESC |
Entries that are not known-obsolete must be processed by performing a
Btree lookup to determine whether they're active or obsolete. These are
indicated by the following
XxxCleaned stats:
#getNLNsCleaned |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_LNS_CLEANED_DESC |
#getNINsCleaned |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_INS_CLEANED_DESC |
#getNBINDeltasCleaned |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_BIN_DELTAS_CLEANED_DESC |
The sum of the
XxxObsolete and
XxxCleaned stats
is the
#getNCleanerEntriesRead():
CleanerEntriesRead =
(LNsObsolete + INsObsolete + BINDeltasObsolete) +
(LNsCleaned + INsCleaned + BINDeltasCleaned)
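This identity can be checked directly against the stat getters, as in the
following sketch (stats is assumed to be an EnvironmentStats):

  long entriesRead = stats.getNCleanerEntriesRead();
  long knownObsolete = stats.getNLNsObsolete()
      + stats.getNINsObsolete() + stats.getNBINDeltasObsolete();
  long cleaned = stats.getNLNsCleaned()
      + stats.getNINsCleaned() + stats.getNBINDeltasCleaned();
  // Per the identity above: entriesRead == knownObsolete + cleaned.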
The number of expired and extinct LNs are broken out as separate stats.
These are a subset of the known-obsolete LNs:
#getNLNsExpired |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_LNS_EXPIRED_DESC |
#getNLNsExtinct |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_LNS_EXTINCT_DESC |
If the Btree lookup does not find the entry, then it is actually
obsolete. This can happen for two reasons:
- The obsolete entry was not recorded as obsolete in the cleaner
metadata during transaction processing. The recording of this
metadata is not always guaranteed.
- The entry became obsolete during processing of the file. The cleaner
loads its metadata when file processing starts, and this metadata is not
updated during file processing.
Such entries are indicated by the
XxxDead stats:
#getNLNsDead |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_LNS_DEAD_DESC |
#getNINsDead |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_INS_DEAD_DESC |
#getNBINDeltasDead |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_BIN_DELTAS_DEAD_DESC |
If the entry is active in the Btree, it must be preserved by the
cleaner. Such entries are indicated by the following stats:
#getNLNsMigrated |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_LNS_MIGRATED_DESC |
#getNLNsMarked |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_LNS_MARKED_DESC |
#getNLNsLocked |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_LNS_LOCKED_DESC |
#getNINsMigrated |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_INS_MIGRATED_DESC |
#getNBINDeltasMigrated |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_BIN_DELTAS_MIGRATED_DESC |
The stats above have the following meanings:
-
Migrated LNs (#getNLNsMigrated()) are logged when they
are processed by the cleaner.
-
Marked LNs (#getNLNsMarked()) are active LNs in temporary
databases (see DatabaseConfig#setTemporary) that are marked
dirty. They will be logged only if they are evicted from cache.
-
Locked LNs (#getNLNsLocked()) cannot be processed
immediately and are added to a pending queue. Pending LNs are
discussed in the next section.
-
Migrated INs (#getNINsMigrated()) are simply marked dirty,
and they will be logged by the next checkpoint.
-
Migrated BIN-deltas (#getNBINDeltasMigrated()) are also
simply marked dirty.
The stats above provide a breakdown of cleaned entries as follows:
-
LNsCleaned = LNsDead + LNsMigrated + LNsMarked + LNsLocked
-
INsCleaned = INsDead + INsMigrated
-
BINDeltasCleaned = BINDeltasDead + BINDeltasMigrated
When LNs are processed, a queue is used to reduce Btree lookups.
LNs are added to the queue when cleaning is needed (they are not
known-obsolete). When the queue fills, the oldest LN in the queue is
processed. If the LN is found in the Btree, the other LNs in the queue are
checked to see if they have the same parent BIN. If so, these LNs can
be processed while the BIN is latched, without an additional Btree lookup.
The number of such LNs is indicated by the following stat:
#getNLNQueueHits |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_LNQUEUE_HITS_DESC |
The LN queue is most beneficial when LNs are inserted or updated in
key order. The maximum size of the queue, expressed as its maximum memory
size, can be changed via the
EnvironmentConfig#CLEANER_LOOK_AHEAD_CACHE_SIZE param.
When the cleaner is processing a Btree entry (LN or IN) there are two
cases where completion of cleaning (and deletion of the file) must be
deferred.
- If an LN that is potentially active (not known-obsolete) is
write-locked, the cleaner cannot determine whether it must be
migrated until the locking transaction ends, either by aborting or
committing.
- If an LN or IN belongs to a Database that is in the process of
being removed or truncated, the LN or IN is considered
known-obsolete but the cleaner must wait until the DB removal/truncation
is complete before the file can be deleted.
If one of these conditions occurs, the LN or DB is added to a pending
queue. The cleaner will periodically process the entries in the queue and
attempt to resolve them as follows.
- When a pending LN is no longer write-locked, a Btree lookup is
performed and the LN is either migrated or considered dead. The LN is
removed from the pending queue.
- When removal/truncation is complete for a pending DB, the DB
is simply removed from the pending queue.
When there are no more pending LNs and DBs for a given file then
cleaning of the file will be considered complete and it will become a
candidate for deletion after the next checkpoint. If a pending entry
causes file deletion to be delayed, because the pending entries cannot be
resolved before the next checkpoint, a WARNING level message is logged
with more information about the pending entries.
The following stats indicate the size of the pending LN queues, how many
LNs in the queue have been processed, and of those processed how many
remain unresolved because the record is still write-locked.
#getPendingLNQueueSize |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_PENDING_LN_QUEUE_SIZE_DESC |
#getNPendingLNsProcessed |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_PENDING_LNS_PROCESSED_DESC |
#getNPendingLNsLocked |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_PENDING_LNS_LOCKED_DESC |
If pending LNs remain unresolved, this could mean an application or JE
bug has prevented a write-lock from being released. This could happen,
for example, if the application fails to end a transaction or close a
cursor. For such bugs, closing and re-opening the Environment is usually
needed to allow file deletion to proceed. If this occurs for multiple files
and is not resolved, it can eventually lead to an out-of-disk situation.
The following stats indicate the size of the pending DB queue, how many
DBs in the queue have been processed, and of those processed how many
remain unresolved because the removal/truncation is still incomplete.
#getPendingDBQueueSize |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_PENDING_DB_QUEUE_SIZE_DESC |
#getNPendingDBsProcessed |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_PENDING_DBS_PROCESSED_DESC |
#getNPendingDBsIncomplete |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_PENDING_DBS_INCOMPLETE_DESC |
If pending DBs remain unresolved, this may indicate that the
asynchronous portion of DB removal/truncation is taking longer than
expected. After a DB removal/truncation transaction is committed, JE
asynchronously counts the data for the DB as obsolete.
When the
WriteOptions#setTTL(int) feature is used, the
obsolete portion of the log includes data that has expired. An expiration
histogram is stored for each file and is used to compute the expired size.
The current minimum (
#getCurrentMinUtilization()) and maximum (
#getCurrentMaxUtilization()) utilization values are the lower and upper
bounds of computed utilization. They are different only when the TTL
feature is used, and some data in the file has expired while other data
has become obsolete for other reasons, such as record updates, record
deletions or checkpoints. In this case the strictly obsolete size and the
expired size may overlap because they are maintained separately.
If the two sizes overlap completely then the minimum utilization is
correct, while if there is no overlap then the maximum utilization is
correct. Both utilization values trigger cleaning, but when there is
significant overlap, the cleaner will perform two-pass cleaning. The
following stats indicate the use of two-pass cleaning:
#getNCleanerTwoPassRuns |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_TWO_PASS_RUNS_DESC |
#getNCleanerRevisalRuns |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_REVISAL_RUNS_DESC |
In the first pass of two-pass cleaning, the file is read to recompute
obsolete and expired sizes, but the file is not cleaned. As a result of
recomputing the expired sizes, the strictly obsolete and expired sizes
will no longer overlap, and the minimum and maximum utilization will be
equal. If the file should still be cleaned, based on the recomputed
utilization, it is cleaned as usual, and in this case the number of
#getNCleanerTwoPassRuns() is incremented.
If the recomputed utilization is higher than expected, the file is
not cleaned. Instead, its recomputed
expiration histogram, which now has size information that does not overlap
with the strictly obsolete data, is stored for future use. By storing the
revised histogram, the cleaner can select the most appropriate files for
cleaning in the future. In this case the number of
#getNCleanerRevisalRuns() is incremented, and the number of
#getNCleanerRuns() is not incremented.
The JE cleaner component is also responsible for checking and enforcing
the
EnvironmentConfig#MAX_DISK and
EnvironmentConfig#FREE_DISK limits, and for protecting cleaned files from
deletion while they are in use by replication, backups, etc. This
process is described in the
EnvironmentConfig#MAX_DISK javadoc. The
stats related to disk space management are:
#getActiveLogSize() |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_ACTIVE_LOG_SIZE_DESC |
#getAvailableLogSize() |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_AVAILABLE_LOG_SIZE_DESC |
#getReservedLogSize() |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_RESERVED_LOG_SIZE_DESC |
#getProtectedLogSize() |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_PROTECTED_LOG_SIZE_DESC |
#getProtectedLogSizeMap() |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_PROTECTED_LOG_SIZE_MAP_DESC |
#getTotalLogSize() |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_TOTAL_LOG_SIZE_DESC |
#getNCleanerDeletions() |
com.sleepycat.je.cleaner.CleanerStatDefinition#CLEANER_DELETIONS_DESC |
The space taken by all data files,
totalLogSize, is divided into
categories according to these stats as illustrated below.
/--------------------------------------------------\
| |
| Active files -- have not been cleaned |
| and cannot be deleted |
| |
| Utilization = |
| (utilized size) / (total active size) |
| |
|--------------------------------------------------|
| |
| Reserved files -- have been cleaned and |
| can be deleted |
| |
| /----------------------------------------------\ |
| | | |
| | Protected files -- temporarily in use by | |
| | replication, backups, etc.| |
| | | |
| \----------------------------------------------/ |
| |
\--------------------------------------------------/
A key point is that reserved data files will be deleted by JE
automatically to prevent violation of a disk limit, as long as the files
are not protected. This has two important implications:
- The current utilization stats (
#getCurrentMinUtilization() and
#getCurrentMaxUtilization()) are calculated based only on the
active data files. Reserved files are
ignored in this calculation.
- The
#getAvailableLogSize() stat includes
the size of the reserved files that are not protected. These files
will be deleted automatically, if this is necessary to allow write
operations.
We strongly recommend using
availableLogSize to monitor disk
usage and take corrective action well before this value reaches zero.
Monitoring the file system free space is not a substitute for this, since
the data files include reserved files that will be deleted by JE
automatically.
Applications should normally define a threshold for
availableLogSize and raise an alert of some kind when the threshold is
reached. When this happens applications may wish to free space (by
deleting records, for example) or expand storage capacity. If JE write
operations are needed as part of this procedure, corrective action
must be taken while there is still enough space available to perform the
write operations.
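A minimal monitoring sketch; the 10 GB threshold is illustrative and must
be determined per application, and env is an open Environment:

  import com.sleepycat.je.EnvironmentStats;

  final long thresholdBytes = 10L * 1024 * 1024 * 1024; // app-specific
  EnvironmentStats stats = env.getStats(null); // null = default config
  if (stats.getAvailableLogSize() < thresholdBytes) {
      // Raise an alert: free space (e.g., delete records) or expand
      // storage while enough space remains for the corrective writes.
  }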
For example, to free space by deleting records requires enough space to
log the deletions, and enough temporary space for the cleaner to reclaim
space for the deleted records. As described in the sections above, the
cleaner uses more disk space temporarily in order to migrate LNs, and a
checkpoint must be performed before deleting the cleaned files.
How much available space is needed is application specific and testing
may be required to determine the application's
availableLogSize threshold. Note that the default
EnvironmentConfig#FREE_DISK value, five GB, may or may not be large enough to perform the application's
recovery procedure. The default
FREE_DISK limit is intended to
reserve space for recovery when application monitoring of
availableLogSize fails and emergency measures must be taken.
If
availableLogSize is unexpectedly low, it is possible that
protected files are preventing space from being reclaimed. This could be
due to replication, backups, etc. See
#getReservedLogSize() and
#getProtectedLogSizeMap() for more information.
It is also possible that data files cannot be deleted due to read-only
processes. When one process opens a JE environment in read-write mode and
one or more additional processes open the environment in
EnvironmentConfig#setReadOnly(boolean) mode, the read-only
processes will prevent the read-write process from deleting data files.
For this reason, long running read-only processes are strongly
discouraged in a production environment. When data file deletion is
prevented for this reason, a SEVERE level message is logged with more
information.
Group Name:
com.sleepycat.je.log.LogStatDefinition#GROUP_NAME
Description:
com.sleepycat.je.log.LogStatDefinition#GROUP_DESC
JE accesses data files (.jdb files) via Java's standard file system
APIs. Because opening a file is relatively expensive, an LRU-based
cache of open file handles is maintained. The stats below indicate how
many cached file handles are currently open and how many open file
operations have taken place.
#getNOpenFiles() |
com.sleepycat.je.log.LogStatDefinition#FILEMGR_OPEN_FILES_DESC |
#getNFileOpens() |
com.sleepycat.je.log.LogStatDefinition#FILEMGR_FILE_OPENS_DESC |
To prevent expensive file open operations during record read operations,
set
EnvironmentConfig#LOG_FILE_CACHE_SIZE to the maximum number of
data files expected in the Environment.
Note that JE may open the same file more than once. If a read operation
in one thread is accessing a file via its cached handle and another thread
attempts to read from the same file, a temporary handle is opened just for
the duration of the read. The
#getNFileOpens() stat includes open
operations for both cached file handles and temporary file handles.
Therefore, this stat cannot be used to determine whether the file cache
is too small.
When a file read is performed, it is always possible for the read buffer
size to be smaller than the log entry being read. This is because JE's
append-only log contains variable sized entries rather than pages. If the
read buffer is too small to contain the entire entry, a repeat read with a
larger buffer must be performed. These additional reads can be reduced by
monitoring the following two stats and increasing the read buffer size as
described below.
When Btree nodes are read at known file locations (by user API
operations, for example), the following stat indicates the number of
repeat reads:
#getNRepeatFaultReads() |
com.sleepycat.je.log.LogStatDefinition#LOGMGR_REPEAT_FAULT_READS_DESC |
When the number of repeat fault reads (
#getNRepeatFaultReads()) is significant,
consider increasing
EnvironmentConfig#LOG_FAULT_READ_SIZE.
When data files are read sequentially (by the cleaner, for example) the
following stat indicates the number of repeat reads:
#getNRepeatIteratorReads() |
com.sleepycat.je.log.LogStatDefinition#LOGMGR_REPEAT_ITERATOR_READS_DESC |
When the number of repeat iterator reads (
#getNRepeatIteratorReads()) is significant,
consider increasing
EnvironmentConfig#LOG_ITERATOR_MAX_SIZE.
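A sketch of checking both counters and raising the corresponding read
buffer sizes in the config used to open the environment; the sizes are
illustrative and stats is an EnvironmentStats from a previous run:

  import com.sleepycat.je.EnvironmentConfig;

  EnvironmentConfig envConfig = new EnvironmentConfig();
  if (stats.getNRepeatFaultReads() > 0) {
      // Larger fault read buffer for reads at known file locations.
      envConfig.setConfigParam(
          EnvironmentConfig.LOG_FAULT_READ_SIZE, String.valueOf(8 * 1024));
  }
  if (stats.getNRepeatIteratorReads() > 0) {
      // Larger maximum buffer for sequential (iterator) reads.
      envConfig.setConfigParam(
          EnvironmentConfig.LOG_ITERATOR_MAX_SIZE, String.valueOf(32 << 20));
  }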
The two groups of stats below indicate JE file system reads and writes
as number of operations and number of bytes. These stats are roughly
divided into random and sequential operations by assuming that storage
devices can optimize for sequential access if two consecutive operations
are performed one MB or less apart in the same file. This categorization
is approximate and may differ from the actual number depending on the
type of disks and file system, disk geometry, and file system cache
size.
The JE file read and write stats can sometimes be useful for debugging
or for getting a rough idea of I/O characteristics. However, monitoring
of system level I/O stats (e.g., using
iostat) gives a more
accurate picture of actual I/O since access via the buffer cache is
not included. In addition the JE stats are not broken out by operation
type and therefore don't add a lot of useful information to the system
level I/O stats, other than the rough division of random and sequential
I/O.
The JE file read stats are:
#getNRandomReads() |
com.sleepycat.je.log.LogStatDefinition#FILEMGR_RANDOM_READS_DESC |
#getNRandomReadBytes() |
com.sleepycat.je.log.LogStatDefinition#FILEMGR_RANDOM_READ_BYTES_DESC |
#getNSequentialReads() |
com.sleepycat.je.log.LogStatDefinition#FILEMGR_SEQUENTIAL_READS_DESC |
#getNSequentialReadBytes() |
com.sleepycat.je.log.LogStatDefinition#FILEMGR_SEQUENTIAL_READ_BYTES_DESC |
JE file read stats include file access resulting from the following
operations. Because internal operations are included, it is not practical
to correlate these stats directly to user operations.
- User read operations via the JE APIs, e.g.,
Database#get(Transaction,DatabaseEntry,DatabaseEntry,Get,ReadOptions) and
Cursor#get(DatabaseEntry,DatabaseEntry,Get,ReadOptions).
- Utility operations that access records such as
EnvironmentConfig#VERIFY_BTREE.
- In a replicated environment, reads are performed by replication and
mastership changes (syncup). In all environments, reads are performed
by recovery (Environment open) and log cleaning.
Log cleaning is typically a significant contributor to read I/O.
- Note that the reads above can cause more read I/O than expected
when
#getNRepeatFaultReads() or
#getNRepeatIteratorReads() are consistently non-zero.
- Note that while
EnvironmentConfig#VERIFY_LOG does perform read I/O, this I/O is not included
in the JE file read stats. The same is true for a
com.sleepycat.je.rep.NetworkRestore in a replicated
environment: the read I/O on the source node is not counted in the JE
file read stats.
The JE file write stats are:
#getNRandomWrites() |
com.sleepycat.je.log.LogStatDefinition#FILEMGR_RANDOM_WRITES_DESC |
#getNRandomWriteBytes() |
com.sleepycat.je.log.LogStatDefinition#FILEMGR_RANDOM_WRITE_BYTES_DESC |
#getNSequentialWrites() |
com.sleepycat.je.log.LogStatDefinition#FILEMGR_SEQUENTIAL_WRITES_DESC |
#getNSequentialWriteBytes() |
com.sleepycat.je.log.LogStatDefinition#FILEMGR_SEQUENTIAL_WRITE_BYTES_DESC |
JE file write stats include file access resulting from the following
operations. As with the read stats, because internal operations are
included it is not practical to correlate the write stats directly to user
operations.
- User write operations via the JE APIs, e.g.,
Database#put(Transaction,DatabaseEntry,DatabaseEntry,Put,WriteOptions) and
Cursor#put(DatabaseEntry,DatabaseEntry,Put,WriteOptions).
- A small number of internal record write operations are performed
to maintain JE internal data structures, e.g., cleaner metadata.
- Writes are performed by checkpointing and
eviction. Checkpoints are typically a
significant contributor to write I/O.
- Note that while
com.sleepycat.je.rep.NetworkRestore does
perform write I/O, this I/O is not included in the JE file
write stats.
JE uses an append-only storage system where each log entry is
assigned an LSN (log sequence number). The LSN is a 64-bit integer
consisting of two 32-bit parts: the file number is the high order
32-bits and the file offset is the low order 32-bits.
LSNs are used in the Btree to reference child nodes from their parent
node. Therefore a node's LSN is assigned when the node is written,
including the case where the write is buffered.
The next LSN to be assigned is indicated by the following stat:
#getEndOfLog() |
com.sleepycat.je.log.LogStatDefinition#LOGMGR_END_OF_LOG_DESC |
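For example, the two parts of the end-of-log LSN can be extracted with
plain bit arithmetic (stats is assumed to be an EnvironmentStats):

  long lsn = stats.getEndOfLog();
  long fileNumber = lsn >>> 32;        // high order 32 bits
  long fileOffset = lsn & 0xFFFFFFFFL; // low order 32 bits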
LSN assignment and assignment of log buffer
space must be performed serially, and therefore these operations occur in
a logging critical section. In general JE strives to do as little
additional work as possible in the logging critical section. However, in
certain cases additional operations are performed in the critical section
and these generally impact performance negatively. These special cases
will be noted in the sections that follow.
A set of JE log buffers is used to buffer writes. When write operations
use
SyncPolicy#NO_SYNC, a file write is not performed until a log
buffer is filled. This positively impacts performance by reducing the
number of file writes. Note that checkpoint writes use
NO_SYNC, so
this benefits performance even when user operations do not use
NO_SYNC.
(When
SyncPolicy#SYNC or
SyncPolicy#WRITE_NO_SYNC is used, the required file write and fsync are performed using a group
commit mechanism, which is described further
below.)
The size and number of log buffers is configured using
EnvironmentConfig#LOG_BUFFER_SIZE,
EnvironmentConfig#LOG_NUM_BUFFERS and
EnvironmentConfig#LOG_TOTAL_BUFFER_BYTES. The resulting total size
and number of buffers is indicated by the following stats:
#getBufferBytes() |
com.sleepycat.je.log.LogStatDefinition#LBFP_BUFFER_BYTES_DESC |
#getNLogBuffers() |
com.sleepycat.je.log.LogStatDefinition#LBFP_LOG_BUFFERS_DESC |
The default buffer size (one MB) is expected to be optimal for most
applications. In NoSQL DB, the default buffer size is used. However, if an
individual entry (e.g., a BIN or LN) is larger than the buffer size, the
log buffer mechanism is bypassed and this can negatively impact
performance. When writing such an entry, the write occurs in the critical
section using a temporary buffer, and any dirty log buffers are also
written in the critical section. When this occurs it is indicated by the
following stat:
#getNTempBufferWrites() |
com.sleepycat.je.log.LogStatDefinition#LOGMGR_TEMP_BUFFER_WRITES_DESC |
When
#getNTempBufferWrites() is consistently non-zero, consider
increasing the log buffer size.
The number of buffers also impacts write performance when many threads
are performing write operations. The use of multiple buffers allows one
writing thread to flush the completed dirty buffers while other writing
threads add entries to "clean" buffers (that have already been written).
If many threads are adding to clean buffers while the completed dirty
buffers are being written, it is possible that no more clean buffers will
be available for adding entries. When this happens, the dirty buffers
are flushed in the critical section, which can negatively impact
performance. This is indicated by the following stat:
#getNNoFreeBuffer() |
com.sleepycat.je.log.LogStatDefinition#LBFP_NO_FREE_BUFFER_DESC |
When
#getNNoFreeBuffer() is consistently non-zero, consider
increasing the number of log buffers.
The number of log buffers also impacts read performance. JE read
operations use the log buffers to read entries that were recently written.
This occurs infrequently in the case of user read operations via the JE
APIs, since recently written data is infrequently read and is often
resident in the cache. However, it does occur
frequently and is an important factor in the following cases:
- A transaction abort reads the entries written by the transaction
in order to undo them. These entries should normally be available in
the log buffers. Avoiding file reads reduces the latency of aborted
transactions.
- In a replicated environment, the master node must read recently
written entries needed by replicas or secondary nodes. By reading these
entries from the log buffers, the likelihood of the need for file reads
is reduced, and this can prevent lagging replicas from falling further
behind.
Because of the last point above involving replication, in NoSQL DB the
number of log buffers is set to 16. In general we recommend configuring 16
buffers or more for a replicated environment.
The following stats indicate the number of requests to read log entries
by LSN, and the number that were not found in the log buffers.
#getNNotResident() |
com.sleepycat.je.log.LogStatDefinition#LBFP_NOT_RESIDENT_DESC |
#getNCacheMiss() |
com.sleepycat.je.log.LogStatDefinition#LBFP_MISS_DESC |
In general these two stats are used only for internal JE debugging and
are not useful to the application. This is because
#getNNotResident() is roughly the sum of the
VLSNIndex nMisses replication stat and the cache fetch miss stats:
#getNLNsFetchMiss(),
#getNBINsFetchMiss(),
#getNFullBINsMiss() and
#getNUpperINsFetchMiss().
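For completeness, the relationship can be spot-checked roughly as
follows. This is a sketch only: stats is an EnvironmentStats obtained
from Environment#getStats, and the VLSNIndex nMisses term comes from
the replication stats, not from this class.

    // Sum of the cache fetch miss stats named above.
    long fetchMisses = stats.getNLNsFetchMiss()
                     + stats.getNBINsFetchMiss()
                     + stats.getNFullBINsMiss()
                     + stats.getNUpperINsFetchMiss();
    // stats.getNNotResident() should roughly equal fetchMisses plus
    // the replication VLSNIndex nMisses stat.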
JE performs special locking to prevent an fsync and a file write from
executing concurrently.
The write queue is a single, low-level buffer that reduces blocking due
to a concurrent fsync and file write request.
When a write of a dirty log buffer is needed to free a log buffer for a
SyncPolicy#NO_SYNC operation (i.e., durability is not required),
the write queue is used to hold the data temporarily and allow a log
buffer to be freed.
Use of the write queue is strongly recommended since there is no known
drawback to using it. It is enabled by default and is used in NoSQL DB. However,
it can be disabled if desired by setting
EnvironmentConfig#LOG_USE_WRITE_QUEUE to
false.
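If it must be disabled, a sketch of the relevant parameter follows;
the related size parameter, discussed below, is shown for context.

    EnvironmentConfig config = new EnvironmentConfig();
    config.setAllowCreate(true);
    // Disable the write queue (not generally recommended; see above).
    config.setConfigParam(EnvironmentConfig.LOG_USE_WRITE_QUEUE, "false");
    // When the write queue is enabled, its size should be at least
    // LOG_BUFFER_SIZE; one MB is the default for both.
    // config.setConfigParam(EnvironmentConfig.LOG_WRITE_QUEUE_SIZE,
    //                       String.valueOf(1024 * 1024));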
The following stats indicate use of the write queue for satisfying
file write and read requests. Note that when the write queue is enabled,
all file read requests must check the write queue to avoid returning stale
data.
#getNWritesFromWriteQueue() |
com.sleepycat.je.log.LogStatDefinition#FILEMGR_WRITES_FROM_WRITEQUEUE_DESC |
#getNBytesWrittenFromWriteQueue() |
com.sleepycat.je.log.LogStatDefinition#FILEMGR_BYTES_WRITTEN_FROM_WRITEQUEUE_DESC |
#getNReadsFromWriteQueue() |
com.sleepycat.je.log.LogStatDefinition#FILEMGR_READS_FROM_WRITEQUEUE_DESC |
#getNBytesReadFromWriteQueue() |
com.sleepycat.je.log.LogStatDefinition#FILEMGR_BYTES_READ_FROM_WRITEQUEUE_DESC |
The default size of the write queue (one MB) is expected to be adequate
for most applications. Note that the write queue size should never be
smaller than the log buffer size (which is also one MB by default). In
NoSQL DB, the default sizes for the write queue and the log buffer are
used.
However, when many
NO_SYNC writes are requested during an fsync,
some write requests may have to block until the fsync is complete. This
is indicated by the following stats:
#getNWriteQueueOverflow() |
com.sleepycat.je.log.LogStatDefinition#FILEMGR_WRITEQUEUE_OVERFLOW_DESC |
#getNWriteQueueOverflowFailures() |
com.sleepycat.je.log.LogStatDefinition#FILEMGR_WRITEQUEUE_OVERFLOW_FAILURES_DESC |
When a
NO_SYNC write request occurs during an fsync and the
size of the write request's data is larger than the free space in the
write queue, the
#getNWriteQueueOverflow() stat is incremented.
When this stat is consistently non-zero, consider the following possible
reasons and remedies:
- The log buffer size may be larger than the write queue size. Ensure
that the
EnvironmentConfig#LOG_WRITE_QUEUE_SIZE is at least as
large as the
EnvironmentConfig#LOG_BUFFER_SIZE.
- An individual log entry (e.g., a BIN or LN) may be larger than the
log buffer size, as indicated by the
#getNTempBufferWrites() stat. Consider increasing
LOG_WRITE_QUEUE_SIZE and/or
LOG_BUFFER_SIZE such that
#getNTempBufferWrites() is
consistently zero.
- When multiple threads perform
NO_SYNC write requests
during a single fsync, the write queue may not be large enough to
prevent overflows. After the two causes above have been ruled out,
consider increasing the
LOG_WRITE_QUEUE_SIZE.
When such a write queue overflow occurs, JE will wait for the fsync to
complete, empty the write queue by writing it to the file, and attempt
again to add the data to the write queue. If this fails again because
there is still not enough free space in the write queue, then the
#getNWriteQueueOverflowFailures() stat is incremented. In this
case the data is written to the file rather than adding it to the write
queue, even though this may require waiting for an fsync to complete.
If
#getNWriteQueueOverflowFailures() is consistently non-zero,
the possible causes are the same as those listed above, and the remedies
described above should be applied.
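A sanity check for the first cause listed above can be performed
before opening the environment. This sketch assumes both parameters
have been set explicitly via setConfigParam, so that getConfigParam
returns parseable values; the helper name is hypothetical.

    static void validateWriteQueueConfig(EnvironmentConfig config) {
        long bufferSize = Long.parseLong(
            config.getConfigParam(EnvironmentConfig.LOG_BUFFER_SIZE));
        long writeQueueSize = Long.parseLong(
            config.getConfigParam(EnvironmentConfig.LOG_WRITE_QUEUE_SIZE));
        if (writeQueueSize < bufferSize) {
            throw new IllegalArgumentException(
                "LOG_WRITE_QUEUE_SIZE (" + writeQueueSize +
                ") must be >= LOG_BUFFER_SIZE (" + bufferSize + ")");
        }
    }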
When
SyncPolicy#SYNC or
SyncPolicy#WRITE_NO_SYNC is
used for transactional write operations, the required file write and fsync
are performed using a group commit mechanism. In the presence of
concurrent transactions, this mechanism often allows performing a single
write and fsync for multiple transactions, while still ensuring that the
write and fsync are performed before the transaction
commit() method (or the
put() or
delete() operation method in the
case of auto-commit) returns successfully.
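For example, a transactional write using SYNC durability participates
in group commit. A minimal sketch; the database handle and the actual
put()/delete() calls are omitted.

    import com.sleepycat.je.Durability;
    import com.sleepycat.je.Environment;
    import com.sleepycat.je.Transaction;
    import com.sleepycat.je.TransactionConfig;

    static void syncCommit(Environment env) {
        TransactionConfig txnConfig = new TransactionConfig();
        // The file write and fsync complete before commit() returns,
        // possibly grouped with other concurrent SYNC commits.
        txnConfig.setDurability(Durability.COMMIT_SYNC);
        Transaction txn = env.beginTransaction(null, txnConfig);
        boolean committed = false;
        try {
            // ... put()/delete() operations using txn ...
            txn.commit();
            committed = true;
        } finally {
            if (!committed) {
                txn.abort();
            }
        }
    }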
First note that not all file write and fsync operations are due to
user transaction commits, and not all fsyncs use the group commit
mechanism.
- In several cases a transaction is committed for an internal
database, or a special log entry is written for an internal operation,
with
SYNC or
WRITE_NO_SYNC durability. In these cases
the group commit mechanism is used.
- When
Environment#flushLog is called by the application,
a file write is performed and, if the
fsync parameter is true,
an fsync is also performed. A file write and fsync are also performed
after an interval of no write activity, as determined by
EnvironmentConfig#LOG_FLUSH_NO_SYNC_INTERVAL and
EnvironmentConfig#LOG_FLUSH_SYNC_INTERVAL. In these cases the
group commit mechanism is not used.
- In several cases an fsync is performed internally that does
not use the group commit mechanism. These cases include:
completion of the last data file when a new file is created, flushing
of metadata before deleting a cleaned file, and Environment open and
close.
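The Environment#flushLog case above can also be invoked directly, for
example to bound the amount of unflushed NO_SYNC data at an
application-chosen point:

    // Write all buffered data to the file system. When the argument is
    // true, also perform an fsync so the data is durable across an OS
    // or machine crash.
    env.flushLog(true);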
The following stats describe all fsyncs performed by JE, whether or not
the group commit mechanism is used.
#getNLogFSyncs() |
com.sleepycat.je.log.LogStatDefinition#FILEMGR_LOG_FSYNCS_DESC |
#getFSyncAvgMs() |
com.sleepycat.je.log.LogStatDefinition#FILEMGR_FSYNC_AVG_MS_DESC |
#getFSync95Ms() |
com.sleepycat.je.log.LogStatDefinition#FILEMGR_FSYNC_95_MS_DESC |
#getFSync99Ms() |
com.sleepycat.je.log.LogStatDefinition#FILEMGR_FSYNC_99_MS_DESC |
#getFSyncMaxMs() |
com.sleepycat.je.log.LogStatDefinition#FILEMGR_FSYNC_MAX_MS_DESC |
Long fsync times often result in long transaction latencies. When this
is indicated by the above stats, ensure that the Linux page cache has
been tuned to permit the OS to write asynchronously to disk whenever
possible. For the NoSQL DB product this is described under Linux Page
Cache Tuning. To aid in diagnosing long fsyncs, a WARNING level
message is logged when the maximum fsync time exceeds
EnvironmentConfig#LOG_FSYNC_TIME_LIMIT.
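A sketch for trending these values with periodic polling follows; the
output format is arbitrary.

    static void logFsyncLatencies(Environment env) {
        EnvironmentStats stats = env.getStats(null);
        System.out.printf(
            "fsyncs=%d avg=%dms p95=%dms p99=%dms max=%dms%n",
            stats.getNLogFSyncs(), stats.getFSyncAvgMs(),
            stats.getFSync95Ms(), stats.getFSync99Ms(),
            stats.getFSyncMaxMs());
    }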
The following stats indicate when group commit is requested for a write
operation. Group commit requests include all user transactions with
SYNC or
WRITE_NO_SYNC durability, as well as the internal JE write
operations that use group commit.
#getNGroupCommitRequests() |
com.sleepycat.je.log.LogStatDefinition#FSYNCMGR_N_GROUP_COMMIT_REQUESTS_DESC |
#getNFSyncRequests() |
com.sleepycat.je.log.LogStatDefinition#FSYNCMGR_FSYNC_REQUESTS_DESC |
All group commit requests result in a group commit operation that
flushes all dirty log buffers and the write queue using a file write.
In addition, requests using
SYNC durability will cause the group
commit operation to include an fsync.
Because group commit operations are performed serially, while a group
commit is executing in one thread, one or more other threads may be
waiting to perform a group commit. The group commit mechanism works by
forming a group containing the waiting threads. When the prior group
commit is finished, a single group commit is performed on behalf of the
new group in one of this group's threads, which is called the
leader. The other threads in the group are called
waiters and they proceed only after the leader has finished the
group commit.
If a waiter thread waits longer than
EnvironmentConfig#LOG_FSYNC_TIMEOUT for the leader to finish the
group commit operation, the waiter will remove itself from the group and
perform a group commit operation independently. The number of such
timeouts is indicated by the following stat:
#getNFSyncTimeouts() |
com.sleepycat.je.log.LogStatDefinition#FSYNCMGR_TIMEOUTS_DESC |
The timeout is intended to prevent waiter threads from waiting
indefinitely due to an unexpected problem. If
#getNFSyncTimeouts() is consistently non-zero and the application is performing normally in
other respects, consider increasing
EnvironmentConfig#LOG_FSYNC_TIMEOUT.
The following stat indicates the number of group commit operations that
included an fsync. There is currently no stat available indicating the
number of group commit operations that did not include an fsync.
#getNFSyncs() |
com.sleepycat.je.log.LogStatDefinition#FSYNCMGR_FSYNCS_DESC |
Note that
#getNFSyncs() is a subset of the
#getNLogFSyncs() total that is described further above.
Group Name:
com.sleepycat.je.incomp.INCompStatDefinition#GROUP_NAME
Description:
com.sleepycat.je.incomp.INCompStatDefinition#GROUP_DESC
The following statistics are available. More information will be
provided in a future release.
#getSplitBins() |
com.sleepycat.je.incomp.INCompStatDefinition#INCOMP_SPLIT_BINS_DESC |
#getDbClosedBins() |
com.sleepycat.je.incomp.INCompStatDefinition#INCOMP_DBCLOSED_BINS_DESC |
#getCursorsBins() |
com.sleepycat.je.incomp.INCompStatDefinition#INCOMP_CURSORS_BINS_DESC |
#getNonEmptyBins() |
com.sleepycat.je.incomp.INCompStatDefinition#INCOMP_NON_EMPTY_BINS_DESC |
#getProcessedBins() |
com.sleepycat.je.incomp.INCompStatDefinition#INCOMP_PROCESSED_BINS_DESC |
#getInCompQueueSize() |
com.sleepycat.je.incomp.INCompStatDefinition#INCOMP_QUEUE_SIZE_DESC |
Group Name:
com.sleepycat.je.recovery.CheckpointStatDefinition#GROUP_NAME
Description:
com.sleepycat.je.recovery.CheckpointStatDefinition#GROUP_DESC
The following statistics are available. More information will be
provided in a future release.
#getNCheckpoints() |
com.sleepycat.je.recovery.CheckpointStatDefinition#CKPT_CHECKPOINTS_DESC |
#getLastCheckpointInterval() |
com.sleepycat.je.recovery.CheckpointStatDefinition#CKPT_LAST_CKPT_INTERVAL_DESC |
#getNFullINFlush() |
com.sleepycat.je.recovery.CheckpointStatDefinition#CKPT_FULL_IN_FLUSH_DESC |
#getNFullBINFlush() |
com.sleepycat.je.recovery.CheckpointStatDefinition#CKPT_FULL_BIN_FLUSH_DESC |
#getNDeltaINFlush() |
com.sleepycat.je.recovery.CheckpointStatDefinition#CKPT_DELTA_IN_FLUSH_DESC |
#getLastCheckpointId() |
com.sleepycat.je.recovery.CheckpointStatDefinition#CKPT_LAST_CKPTID_DESC |
#getLastCheckpointStart() |
com.sleepycat.je.recovery.CheckpointStatDefinition#CKPT_LAST_CKPT_START_DESC |
#getLastCheckpointEnd() |
com.sleepycat.je.recovery.CheckpointStatDefinition#CKPT_LAST_CKPT_END_DESC |
Group Name:
com.sleepycat.je.txn.LockStatDefinition#GROUP_NAME
Description:
com.sleepycat.je.txn.LockStatDefinition#GROUP_DESC
The following statistics are available. More information will be
provided in a future release.
#getNReadLocks() |
com.sleepycat.je.txn.LockStatDefinition#LOCK_READ_LOCKS_DESC |
#getNWriteLocks() |
com.sleepycat.je.txn.LockStatDefinition#LOCK_WRITE_LOCKS_DESC |
#getNOwners() |
com.sleepycat.je.txn.LockStatDefinition#LOCK_OWNERS_DESC |
#getNRequests() |
com.sleepycat.je.txn.LockStatDefinition#LOCK_REQUESTS_DESC |
#getNTotalLocks() |
com.sleepycat.je.txn.LockStatDefinition#LOCK_TOTAL_DESC |
#getNWaits() |
com.sleepycat.je.txn.LockStatDefinition#LOCK_WAITS_DESC |
#getNWaiters() |
com.sleepycat.je.txn.LockStatDefinition#LOCK_WAITERS_DESC |
Group Name:
com.sleepycat.je.dbi.DbiStatDefinition#THROUGHPUT_GROUP_NAME
Description:
com.sleepycat.je.dbi.DbiStatDefinition#THROUGHPUT_GROUP_DESC
The following statistics are available. More information will be
provided in a future release.
#getPriSearchOps() |
com.sleepycat.je.dbi.DbiStatDefinition#THROUGHPUT_PRI_SEARCH_DESC |
#getPriSearchFailOps() |
com.sleepycat.je.dbi.DbiStatDefinition#THROUGHPUT_PRI_SEARCH_FAIL_DESC |
#getSecSearchOps() |
com.sleepycat.je.dbi.DbiStatDefinition#THROUGHPUT_SEC_SEARCH_DESC |
#getSecSearchFailOps() |
com.sleepycat.je.dbi.DbiStatDefinition#THROUGHPUT_SEC_SEARCH_FAIL_DESC |
#getPriPositionOps() |
com.sleepycat.je.dbi.DbiStatDefinition#THROUGHPUT_PRI_POSITION_DESC |
#getSecPositionOps() |
com.sleepycat.je.dbi.DbiStatDefinition#THROUGHPUT_SEC_POSITION_DESC |
#getPriInsertOps() |
com.sleepycat.je.dbi.DbiStatDefinition#THROUGHPUT_PRI_INSERT_DESC |
#getPriInsertFailOps() |
com.sleepycat.je.dbi.DbiStatDefinition#THROUGHPUT_PRI_INSERT_FAIL_DESC |
#getSecInsertOps() |
com.sleepycat.je.dbi.DbiStatDefinition#THROUGHPUT_SEC_INSERT_DESC |
#getPriUpdateOps() |
com.sleepycat.je.dbi.DbiStatDefinition#THROUGHPUT_PRI_UPDATE_DESC |
#getSecUpdateOps() |
com.sleepycat.je.dbi.DbiStatDefinition#THROUGHPUT_SEC_UPDATE_DESC |
#getPriDeleteOps() |
com.sleepycat.je.dbi.DbiStatDefinition#THROUGHPUT_PRI_DELETE_DESC |
#getPriDeleteFailOps() |
com.sleepycat.je.dbi.DbiStatDefinition#THROUGHPUT_PRI_DELETE_FAIL_DESC |
#getSecDeleteOps() |
com.sleepycat.je.dbi.DbiStatDefinition#THROUGHPUT_SEC_DELETE_DESC |
Group Name:
com.sleepycat.je.dbi.BTreeStatDefinition#BT_OP_GROUP_NAME
Description:
com.sleepycat.je.dbi.BTreeStatDefinition#BT_OP_GROUP_DESC
The following statistics are available. More information will be
provided in a future release.
#getRelatchesRequired() |
com.sleepycat.je.dbi.BTreeStatDefinition#BT_OP_RELATCHES_REQUIRED_DESC |
#getRootSplits() |
com.sleepycat.je.dbi.BTreeStatDefinition#BT_OP_ROOT_SPLITS_DESC |
#getNBinDeltaGetOps() |
com.sleepycat.je.dbi.BTreeStatDefinition#BT_OP_BIN_DELTA_GETS_DESC |
#getNBinDeltaInsertOps() |
com.sleepycat.je.dbi.BTreeStatDefinition#BT_OP_BIN_DELTA_INSERTS_DESC |
#getNBinDeltaUpdateOps() |
com.sleepycat.je.dbi.BTreeStatDefinition#BT_OP_BIN_DELTA_UPDATES_DESC |
#getNBinDeltaDeleteOps() |
com.sleepycat.je.dbi.BTreeStatDefinition#BT_OP_BIN_DELTA_DELETES_DESC |
Group Name:
com.sleepycat.je.dbi.DbiStatDefinition#ENV_GROUP_NAME
Description:
com.sleepycat.je.dbi.DbiStatDefinition#ENV_GROUP_DESC
The following statistics are available. More information will be
provided in a future release.
#getEnvironmentCreationTime() |
com.sleepycat.je.dbi.DbiStatDefinition#ENV_CREATION_TIME_DESC |