/** Returns the GC-deletes interval, in milliseconds, from the current index settings. */
long getGcDeletesInMillis() {
    final IndexSettings indexSettings = engineConfig.getIndexSettings();
    return indexSettings.getGcDeletesInMillis();
}
/**
 * Restores the soft-deletes policy from the last committed segments info.
 * If the commit recorded {@code Engine.MIN_RETAINED_SEQNO} we resume retention from that value;
 * otherwise (a commit written before soft-deletes retention was tracked) we start retaining
 * just above the commit's max_seq_no.
 */
private SoftDeletesPolicy newSoftDeletesPolicy() throws IOException {
    final Map<String, String> userData = store.readLastCommittedSegmentsInfo().userData;
    final long lastMinRetainedSeqNo;
    if (userData.containsKey(Engine.MIN_RETAINED_SEQNO)) {
        lastMinRetainedSeqNo = Long.parseLong(userData.get(Engine.MIN_RETAINED_SEQNO));
    } else {
        // no retained-seqno marker in the commit: retain everything after the commit's max_seq_no
        lastMinRetainedSeqNo = Long.parseLong(userData.get(SequenceNumbers.MAX_SEQ_NO)) + 1;
    }
    return new SoftDeletesPolicy(
        translog::getLastSyncedGlobalCheckpoint,
        lastMinRetainedSeqNo,
        engineConfig.getIndexSettings().getSoftDeleteRetentionOperations());
}
/**
 * Creates a snapshot of Lucene operations with seq# in {@code [fromSeqNo, toSeqNo]}, read from
 * the Lucene index via soft-deletes. When {@code requiredFullRange} is true the snapshot fails
 * if any seq# in the requested range is missing.
 * <p>
 * BUG FIX: the previous implementation delegated to {@code readHistoryOperations}, which itself
 * delegates back to this method when soft-deletes are enabled — an infinite mutual recursion
 * (StackOverflowError) — and it silently dropped {@code toSeqNo} and {@code requiredFullRange}.
 * This version builds the Lucene changes snapshot directly, matching the upstream Elasticsearch
 * implementation.
 *
 * @throws IllegalStateException if soft-deletes are disabled for this index
 */
@Override
public Translog.Snapshot newChangesSnapshot(String source, MapperService mapperService,
                                            long fromSeqNo, long toSeqNo, boolean requiredFullRange) throws IOException {
    if (engineConfig.getIndexSettings().isSoftDeleteEnabled() == false) {
        throw new IllegalStateException("accessing changes snapshot requires soft-deletes enabled");
    }
    ensureOpen();
    // Refresh so that every operation up to toSeqNo is visible to the snapshot reader.
    // NOTE(review): assumes lastRefreshedCheckpoint()/refresh(...) are available on this engine
    // as in upstream InternalEngine — confirm against the rest of this file.
    if (lastRefreshedCheckpoint() < toSeqNo) {
        refresh(source, SearcherScope.INTERNAL);
    }
    Searcher searcher = acquireSearcher(source, SearcherScope.INTERNAL);
    try {
        final LuceneChangesSnapshot snapshot = new LuceneChangesSnapshot(
            searcher, mapperService, LuceneChangesSnapshot.DEFAULT_BATCH_SIZE, fromSeqNo, toSeqNo, requiredFullRange);
        searcher = null; // ownership transferred to the snapshot; it closes the searcher
        return snapshot;
    } finally {
        if (searcher != null) { // only on construction failure: release the acquired searcher
            searcher.close();
        }
    }
}
private boolean assertSequenceNumberBeforeIndexing(final Engine.Operation.Origin origin, final long seqNo) { if (engineConfig.getIndexSettings().getIndexVersionCreated().onOrAfter(Version.V_6_0_0_alpha1) || origin == Operation.Origin.PRIMARY) { // sequence number should be set when operation origin is primary or when all shards are on new nodes assert seqNo >= 0 : "ops should have an assigned seq no.; origin: " + origin; } return true; }
/**
 * Asserts the invariants on an operation's incoming sequence number before processing:
 * <ul>
 *   <li>Ops replayed from the local translog of a pre-6.0 index must NOT already carry a seq# (legacy path).</li>
 *   <li>Primary-origin ops are checked by {@code assertPrimaryIncomingSequenceNumber}.</li>
 *   <li>Any other origin on a 6.0+ index (replica / recovery) must carry an assigned seq#.</li>
 * </ul>
 * Always returns {@code true} so the call can be wrapped in an {@code assert} statement.
 */
private boolean assertIncomingSequenceNumber(final Engine.Operation.Origin origin, final long seqNo) { if (engineConfig.getIndexSettings().getIndexVersionCreated().before(Version.V_6_0_0_alpha1) && origin == Operation.Origin.LOCAL_TRANSLOG_RECOVERY) { // legacy support assert seqNo == SequenceNumbers.UNASSIGNED_SEQ_NO : "old op recovering but it already has a seq no.;" + " index version: " + engineConfig.getIndexSettings().getIndexVersionCreated() + ", seqNo: " + seqNo; } else if (origin == Operation.Origin.PRIMARY) { assertPrimaryIncomingSequenceNumber(origin, seqNo); } else if (engineConfig.getIndexSettings().getIndexVersionCreated().onOrAfter(Version.V_6_0_0_alpha1)) { // sequence number should be set when operation origin is not primary assert seqNo >= 0 : "recovery or replica ops should have an assigned seq no.; origin: " + origin; } return true; }
/**
 * Creates a history snapshot of operations whose seq# is at least {@code startingSeqNo}.
 * With soft-deletes enabled the operations come from the Lucene index; otherwise they are
 * read from translog files.
 */
@Override
public Translog.Snapshot readHistoryOperations(String source, MapperService mapperService, long startingSeqNo) throws IOException {
    if (engineConfig.getIndexSettings().isSoftDeleteEnabled() == false) {
        return getTranslog().newSnapshotFromMinSeqNo(startingSeqNo);
    }
    return newChangesSnapshot(source, mapperService, Math.max(0, startingSeqNo), Long.MAX_VALUE, false);
}
/**
 * Wraps a raw directory reader for searching: tags it with this shard's id, hides
 * soft-deleted documents when soft-deletes are enabled, then applies the caller's wrapper.
 */
protected final DirectoryReader wrapReader(DirectoryReader reader,
                                           Function<DirectoryReader, DirectoryReader> readerWrapperFunction) throws IOException {
    DirectoryReader wrapped = ElasticsearchDirectoryReader.wrap(reader, engineConfig.getShardId());
    if (engineConfig.getIndexSettings().isSoftDeleteEnabled()) {
        wrapped = new SoftDeletesDirectoryReaderWrapper(wrapped, Lucene.SOFT_DELETES_FIELD);
    }
    return readerWrapperFunction.apply(wrapped);
}
/**
 * Replays translog operations (via {@code translogRecoveryRunner}) up to {@code recoverUpToSeqNo}.
 * The flush lock is held for the entire recovery so no concurrent flush can commit a
 * half-recovered state. If replay fails, {@code pendingTranslogRecovery} is re-set to true so
 * commits stay disabled (see {@code ensureCanFlush}) and the engine is failed; any failure while
 * failing the engine is suppressed onto the original exception. Returns {@code this} for chaining.
 *
 * @throws IllegalStateException if the engine has already been recovered
 */
@Override public InternalEngine recoverFromTranslog(TranslogRecoveryRunner translogRecoveryRunner, long recoverUpToSeqNo) throws IOException { flushLock.lock(); try (ReleasableLock lock = readLock.acquire()) { ensureOpen(); assert getMaxSeqNoOfUpdatesOrDeletes() != SequenceNumbers.UNASSIGNED_SEQ_NO || engineConfig.getIndexSettings().getIndexVersionCreated().before(Version.V_6_0_0) : "max_seq_no_of_updates is uninitialized"; if (pendingTranslogRecovery.get() == false) { throw new IllegalStateException("Engine has already been recovered"); } try { recoverFromTranslogInternal(translogRecoveryRunner, recoverUpToSeqNo); } catch (Exception e) { try { pendingTranslogRecovery.set(true); // just play safe and never allow commits on this see #ensureCanFlush failEngine("failed to recover from translog", e); } catch (Exception inner) { e.addSuppressed(inner); } throw e; } } finally { flushLock.unlock(); } return this; }
/**
 * Builds the {@code LocalCheckpointTracker} from the seq# stats stored in the last Lucene commit.
 * When soft-deletes are enabled and the commit's local checkpoint lags behind max_seq_no, the gap
 * {@code (localCheckpoint, maxSeqNo]} is re-scanned from the Lucene index so the tracker marks
 * exactly the operations already present — required so the max_seq_no_of_updates optimization
 * never re-applies an op and creates duplicates (see inline comment for the alternative rejected).
 *
 * @throws EngineCreationFailureException if the commit metadata or index cannot be read
 */
private static LocalCheckpointTracker createLocalCheckpointTracker(EngineConfig engineConfig, SegmentInfos lastCommittedSegmentInfos, Logger logger, Supplier<Searcher> searcherSupplier, BiFunction<Long, Long, LocalCheckpointTracker> localCheckpointTrackerSupplier) { try { final SequenceNumbers.CommitInfo seqNoStats = SequenceNumbers.loadSeqNoInfoFromLuceneCommit(lastCommittedSegmentInfos.userData.entrySet()); final long maxSeqNo = seqNoStats.maxSeqNo; final long localCheckpoint = seqNoStats.localCheckpoint; logger.trace("recovered maximum sequence number [{}] and local checkpoint [{}]", maxSeqNo, localCheckpoint); final LocalCheckpointTracker tracker = localCheckpointTrackerSupplier.apply(maxSeqNo, localCheckpoint); // Operations that are optimized using max_seq_no_of_updates optimization must not be processed twice; otherwise, they will // create duplicates in Lucene. To avoid this we check the LocalCheckpointTracker to see if an operation was already processed. // Thus, we need to restore the LocalCheckpointTracker bit by bit to ensure the consistency between LocalCheckpointTracker and // Lucene index. This is not the only solution since we can bootstrap max_seq_no_of_updates with max_seq_no of the commit to // disable the MSU optimization during recovery. Here we prefer to maintain the consistency of LocalCheckpointTracker. if (localCheckpoint < maxSeqNo && engineConfig.getIndexSettings().isSoftDeleteEnabled()) { try (Searcher searcher = searcherSupplier.get()) { Lucene.scanSeqNosInReader(searcher.getDirectoryReader(), localCheckpoint + 1, maxSeqNo, tracker::markSeqNoAsCompleted); } } return tracker; } catch (IOException ex) { throw new EngineCreationFailureException(engineConfig.getShardId(), "failed to create local checkpoint tracker", ex); } }
/**
 * Estimates how many history operations with seq# at least {@code startingSeqNo} this engine
 * holds — counted from a Lucene changes snapshot when soft-deletes are enabled, otherwise
 * from translog files.
 */
@Override
public int estimateNumberOfHistoryOperations(String source, MapperService mapperService, long startingSeqNo) throws IOException {
    if (engineConfig.getIndexSettings().isSoftDeleteEnabled() == false) {
        return getTranslog().estimateTotalOperationsFromMinSeq(startingSeqNo);
    }
    try (Translog.Snapshot snapshot =
             newChangesSnapshot(source, mapperService, Math.max(0, startingSeqNo), Long.MAX_VALUE, false)) {
        return snapshot.totalOperations();
    }
}
private Translog openTranslog(EngineConfig engineConfig, TranslogDeletionPolicy translogDeletionPolicy, LongSupplier globalCheckpointSupplier) throws IOException { final TranslogConfig translogConfig = engineConfig.getTranslogConfig(); final String translogUUID = loadTranslogUUIDFromLastCommit(); // A translog checkpoint from 5.x index does not have translog_generation_key and Translog's ctor will read translog gen values // from translogDeletionPolicy. We need to bootstrap these values from the recovering commit before calling Translog ctor. if (engineConfig.getIndexSettings().getIndexVersionCreated().before(Version.V_6_0_0)) { final SegmentInfos lastCommitInfo = store.readLastCommittedSegmentsInfo(); final long minRequiredTranslogGen = Long.parseLong(lastCommitInfo.userData.get(Translog.TRANSLOG_GENERATION_KEY)); translogDeletionPolicy.setTranslogGenerationOfLastCommit(minRequiredTranslogGen); translogDeletionPolicy.setMinTranslogGenerationForRecovery(minRequiredTranslogGen); } // We expect that this shard already exists, so it must already have an existing translog else something is badly wrong! return new Translog(translogConfig, translogUUID, translogDeletionPolicy, globalCheckpointSupplier, engineConfig.getPrimaryTermSupplier()); }
private void pruneDeletedTombstones() {
    /*
     * We need to deploy two different trimming strategies for GC deletes on primary and replicas. Delete operations on primary
     * are remembered for at least one GC delete cycle and trimmed periodically. This is, at the moment, the best we can do on
     * primary for user facing APIs but this arbitrary time limit is problematic for replicas. On replicas however we should
     * trim only deletes whose seqno at most the local checkpoint. This requirement is explained as follows.
     *
     * Suppose o1 and o2 are two operations on the same document with seq#(o1) < seq#(o2), and o2 arrives before o1 on the
     * replica. o2 is processed normally since it arrives first; when o1 arrives it should be discarded:
     * - If seq#(o1) <= LCP, then it will be not be added to Lucene, as it was already previously added.
     * - If seq#(o1) > LCP, then it depends on the nature of o2:
     *   *) If o2 is a delete then its seq# is recorded in the VersionMap, since seq#(o2) > seq#(o1) > LCP,
     *      so a lookup can find it and determine that o1 is stale.
     *   *) If o2 is an indexing then its seq# is either in Lucene (if refreshed) or the VersionMap (if not refreshed yet),
     *      so a real-time lookup can find it and determine that o1 is stale.
     *
     * Here we prefer to deploy a single trimming strategy, which satisfies both constraints, on primary and replicas because:
     * - It's simpler - no need to distinguish if an engine is running at primary mode or replica mode or being promoted.
     * - If a replica subsequently is promoted, user experience is maintained as that replica remembers deletes for the last
     *   GC cycle.
     *
     * However, the version map may consume less memory if we deploy two different trimming strategies for primary and replicas.
     */
    final long nowInMillis = engineConfig.getThreadPool().relativeTimeInMillis();
    final long pruneBeforeTimestamp = nowInMillis - engineConfig.getIndexSettings().getGcDeletesInMillis();
    versionMap.pruneTombstones(pruneBeforeTimestamp, localCheckpointTracker.getCheckpoint());
    lastDeleteVersionPruneTimeMSec = nowInMillis;
}
/**
 * Decides whether a periodic flush is needed to shrink the uncommitted translog. Returns
 * {@code true} only when the uncommitted size exceeds the flush threshold AND a new commit would
 * actually reference a later translog generation than the last commit (or the local checkpoint
 * equals max_seq_no, which guarantees a generation roll) — this second condition prevents an
 * endless flush loop; see the inline comment for the full reasoning. Translog-only concern:
 * {@code IndexWriter#hasUncommittedChanges} is deliberately ignored here.
 */
@Override public boolean shouldPeriodicallyFlush() { ensureOpen(); final long translogGenerationOfLastCommit = Long.parseLong(lastCommittedSegmentInfos.userData.get(Translog.TRANSLOG_GENERATION_KEY)); final long flushThreshold = config().getIndexSettings().getFlushThresholdSize().getBytes(); if (translog.sizeInBytesByMinGen(translogGenerationOfLastCommit) < flushThreshold) { return false; } /* * We flush to reduce the size of uncommitted translog but strictly speaking the uncommitted size won't always be * below the flush-threshold after a flush. To avoid getting into an endless loop of flushing, we only enable the * periodically flush condition if this condition is disabled after a flush. The condition will change if the new * commit points to the later generation the last commit's(eg. gen-of-last-commit < gen-of-new-commit)[1]. * * When the local checkpoint equals to max_seqno, and translog-gen of the last commit equals to translog-gen of * the new commit, we know that the last generation must contain operations because its size is above the flush * threshold and the flush-threshold is guaranteed to be higher than an empty translog by the setting validation. * This guarantees that the new commit will point to the newly rolled generation. In fact, this scenario only * happens when the generation-threshold is close to or above the flush-threshold; otherwise we have rolled * generations as the generation-threshold was reached, then the first condition (eg. [1]) is already satisfied. * * This method is to maintain translog only, thus IndexWriter#hasUncommittedChanges condition is not considered. */ final long translogGenerationOfNewCommit = translog.getMinGenerationForSeqNo(localCheckpointTracker.getCheckpoint() + 1).translogFileGeneration; return translogGenerationOfLastCommit < translogGenerationOfNewCommit || localCheckpointTracker.getCheckpoint() == localCheckpointTracker.getMaxSeqNo(); }
/**
 * Asserts that this operation's seq# does not exceed max_seq_no_of_updates (msu), with three
 * deliberate relaxations: (1) msu unassigned — the primary is pre-6.5 and does not replicate msu;
 * (2) {@code allowDeleted} — a tombstone delete on a replica is treated as a regular doc update,
 * so a deleted VersionMap entry is accepted; (3) {@code relaxIfGapInSeqNo} — out-of-order delivery
 * on a replica can legitimately make a later op's seq# exceed msu while a seq# gap is pending
 * below msu (see inline example). Always returns {@code true} for use inside an assert.
 */
private boolean assertMaxSeqNoOfUpdatesIsAdvanced(Term id, long seqNo, boolean allowDeleted, boolean relaxIfGapInSeqNo) { final long maxSeqNoOfUpdates = getMaxSeqNoOfUpdatesOrDeletes(); // If the primary is on an old version which does not replicate msu, we need to relax this assertion for that. if (maxSeqNoOfUpdates == SequenceNumbers.UNASSIGNED_SEQ_NO) { assert config().getIndexSettings().getIndexVersionCreated().before(Version.V_6_5_0); return true; } // We treat a delete on the tombstones on replicas as a regular document, then use updateDocument (not addDocument). if (allowDeleted) { final VersionValue versionValue = versionMap.getVersionForAssert(id.bytes()); if (versionValue != null && versionValue.isDelete()) { return true; } } // Operations can be processed on a replica in a different order than on the primary. If the order on the primary is index-1, // delete-2, index-3, and the order on a replica is index-1, index-3, delete-2, then the msu of index-3 on the replica is 2 // even though it is an update (overwrites index-1). We should relax this assertion if there is a pending gap in the seq_no. if (relaxIfGapInSeqNo && getLocalCheckpoint() < maxSeqNoOfUpdates) { return true; } assert seqNo <= maxSeqNoOfUpdates : "id=" + id + " seq_no=" + seqNo + " msu=" + maxSeqNoOfUpdates; return true; }
updateAutoIdTimestamp(Long.MAX_VALUE, true); this.uidField = engineConfig.getIndexSettings().isSingleType() ? IdFieldMapper.NAME : UidFieldMapper.NAME; final TranslogDeletionPolicy translogDeletionPolicy = new TranslogDeletionPolicy( engineConfig.getIndexSettings().getTranslogRetentionSize().getBytes(), engineConfig.getIndexSettings().getTranslogRetentionAge().getMillis() ); store.incRef(); this.lastDeleteVersionPruneTimeMSec = engineConfig.getThreadPool().relativeTimeInMillis(); mergeScheduler = scheduler = new EngineMergeScheduler(engineConfig.getShardId(), engineConfig.getIndexSettings()); throttle = new IndexThrottle(); try { assert translog.getGeneration() != null; this.translog = translog; this.softDeleteEnabled = engineConfig.getIndexSettings().isSoftDeleteEnabled(); this.softDeletesPolicy = newSoftDeletesPolicy(); this.combinedDeletionPolicy =
/**
 * Returns whether this engine retains every operation with seq# at least {@code startingSeqNo}.
 * With soft-deletes it suffices that the minimum retained seq# is at or below the requested start;
 * otherwise the translog is scanned and the collected seq#s must cover everything up to the
 * current local checkpoint without gaps.
 */
@Override
public boolean hasCompleteOperationHistory(String source, MapperService mapperService, long startingSeqNo) throws IOException {
    if (engineConfig.getIndexSettings().isSoftDeleteEnabled()) {
        return getMinRetainedSeqNo() <= startingSeqNo;
    }
    final long currentLocalCheckpoint = getLocalCheckpointTracker().getCheckpoint();
    // track which seq#s at/after startingSeqNo the translog actually contains
    final LocalCheckpointTracker tracker = new LocalCheckpointTracker(startingSeqNo, startingSeqNo - 1);
    try (Translog.Snapshot snapshot = getTranslog().newSnapshotFromMinSeqNo(startingSeqNo)) {
        for (Translog.Operation operation = snapshot.next(); operation != null; operation = snapshot.next()) {
            if (operation.seqNo() != SequenceNumbers.UNASSIGNED_SEQ_NO) {
                tracker.markSeqNoAsCompleted(operation.seqNo());
            }
        }
    }
    return tracker.getCheckpoint() >= currentLocalCheckpoint;
}
public void onSettingsChanged() { mergeScheduler.refreshConfig(); // config().isEnableGcDeletes() or config.getGcDeletesInMillis() may have changed: maybePruneDeletes(); if (engineConfig.isAutoGeneratedIDsOptimizationEnabled() == false) { // this is an anti-viral settings you can only opt out for the entire index // only if a shard starts up again due to relocation or if the index is closed // the setting will be re-interpreted if it's set to true updateAutoIdTimestamp(Long.MAX_VALUE, true); } final TranslogDeletionPolicy translogDeletionPolicy = translog.getDeletionPolicy(); final IndexSettings indexSettings = engineConfig.getIndexSettings(); translogDeletionPolicy.setRetentionAgeInMillis(indexSettings.getTranslogRetentionAge().getMillis()); translogDeletionPolicy.setRetentionSizeInBytes(indexSettings.getTranslogRetentionSize().getBytes()); softDeletesPolicy.setRetentionOperations(indexSettings.getSoftDeleteRetentionOperations()); }
assert config().getIndexSettings().getIndexVersionCreated().before(Version.V_6_0_0_alpha1) : "index is newly created but op has no sequence numbers. op: " + delete; opVsLucene = compareOpToLuceneDocBasedOnVersions(delete);
.getIndexSettings().getIndexVersionCreated()); return new GetResult(new Searcher("realtime_get", new IndexSearcher(reader), reader::close), new VersionsAndSeqNoResolver.DocIdAndVersion(0, index.version(), index.seqNo(), index.primaryTerm(),
assert config().getIndexSettings().getIndexVersionCreated().before(Version.V_6_0_0_alpha1) : "index is newly created but op has no sequence numbers. op: " + index; opVsLucene = compareOpToLuceneDocBasedOnVersions(index);