Description
Describe the bug
While running some perf tests for Segment Replication, I hit an exception while computing a metadata snapshot from the primary's latest SegmentInfos. I think this is caused by the primary missing a segments_N file on disk that is still referenced by the latest SegmentInfos returned from getLatestSegmentInfos. When the exception is hit, the files referenced by the SegmentInfos and the files actually on disk are:
Segment Infos files: [_1.cfs, _0.cfe, _0.si, _1.cfe, _1.si, _2.si, _0.cfs, _2.cfe, _2.cfs, segments_2]
On disk files: [_0.cfe, _0.cfs, _0.si, _1.cfe, _1.cfs, _1.si, _2.cfe, _2.cfs, _2.si, segments_3, write.lock]
This means the primary needs to refresh its reader before we compute the metadata; otherwise we can hit this situation.
Further, we invoke indexWriter.incRefDeleter while fetching the latest SegmentInfos and return it as a Closeable that releases the files. However, incRefDeleter does not incRef the segments_N file (see https://github.com/apache/lucene/blob/main/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java#L5831).
This means that this file could be merged away before we finish copying it to a replica.
Trace:
[2022-08-09T18:10:36,763][WARN ][o.o.c.r.a.AllocationService] [node-3] failing shard [failed shard, shard [so][2], node[QKw3L21KRQeGJ8tmF1ENpQ], [R], s[STARTED], a[id=7ETNwg5VQ9GEpOPHnX1UXw], message [shard failure, reason [replication failure]], failure [OpenSearchException[Segment Replication failed]; nested: RemoteTransportException[[node-3][:9300][internal:index/shard/replication/get_checkpoint_info]]; nested: NoSuchFileException[/home/ec2-user/opensearch-3.0.0/data/nodes/0/indices/sBLLTSH0QtqQ5OVQKJqUDQ/2/index/segments_2]; ], markAsStale [true]]
org.opensearch.OpenSearchException: Segment Replication failed
at org.opensearch.indices.replication.SegmentReplicationTargetService$3.onFailure(SegmentReplicationTargetService.java:235) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.action.ActionListener$1.onFailure(ActionListener.java:88) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.action.ActionRunnable.onFailure(ActionRunnable.java:103) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:54) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.common.util.concurrent.OpenSearchExecutors$DirectExecutorService.execute(OpenSearchExecutors.java:341) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.common.util.concurrent.ListenableFuture.notifyListener(ListenableFuture.java:120) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.common.util.concurrent.ListenableFuture.lambda$done$0(ListenableFuture.java:112) ~[opensearch-3.0.0.jar:3.0.0]
at java.util.ArrayList.forEach(ArrayList.java:1511) ~[?:?]
at org.opensearch.common.util.concurrent.ListenableFuture.done(ListenableFuture.java:112) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.common.util.concurrent.BaseFuture.setException(BaseFuture.java:178) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.common.util.concurrent.ListenableFuture.onFailure(ListenableFuture.java:149) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.action.StepListener.innerOnFailure(StepListener.java:82) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.action.NotifyOnceListener.onFailure(NotifyOnceListener.java:62) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.action.ActionListener$4.onFailure(ActionListener.java:190) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.action.ActionListener$6.onFailure(ActionListener.java:309) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.action.support.RetryableAction$RetryingListener.onFinalFailure(RetryableAction.java:201) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.action.support.RetryableAction$RetryingListener.onFailure(RetryableAction.java:193) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.action.ActionListenerResponseHandler.handleException(ActionListenerResponseHandler.java:74) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.transport.TransportService$ContextRestoreResponseHandler.handleException(TransportService.java:1370) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.transport.InboundHandler.lambda$handleException$3(InboundHandler.java:420) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.common.util.concurrent.ThreadContext$ContextPreservingRunnable.run(ThreadContext.java:747) ~[opensearch-3.0.0.jar:3.0.0]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136) [?:?]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635) [?:?]
at java.lang.Thread.run(Thread.java:833) [?:?]
Caused by: org.opensearch.transport.RemoteTransportException: [node-3][:9300][internal:index/shard/replication/get_checkpoint_info]
Caused by: java.nio.file.NoSuchFileException: /home/ec2-user/opensearch-3.0.0/data/nodes/0/indices/sBLLTSH0QtqQ5OVQKJqUDQ/2/index/segments_2
at sun.nio.fs.UnixException.translateToIOException(UnixException.java:92) ~[?:?]
at sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:106) ~[?:?]
at sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:111) ~[?:?]
at sun.nio.fs.UnixFileSystemProvider.newFileChannel(UnixFileSystemProvider.java:181) ~[?:?]
at java.nio.channels.FileChannel.open(FileChannel.java:298) ~[?:?]
at java.nio.channels.FileChannel.open(FileChannel.java:357) ~[?:?]
at org.apache.lucene.store.NIOFSDirectory.openInput(NIOFSDirectory.java:78) ~[lucene-core-9.3.0.jar:9.3.0 d25cebcef7a80369f4dfb9285ca7360a810b75dc - ivera - 2022-07-25 12:30:23]
at org.opensearch.index.store.FsDirectoryFactory$HybridDirectory.openInput(FsDirectoryFactory.java:166) ~[opensearch-3.0.0.jar:3.0.0]
at org.apache.lucene.store.FilterDirectory.openInput(FilterDirectory.java:101) ~[lucene-core-9.3.0.jar:9.3.0 d25cebcef7a80369f4dfb9285ca7360a810b75dc - ivera - 2022-07-25 12:30:23]
at org.apache.lucene.store.FilterDirectory.openInput(FilterDirectory.java:101) ~[lucene-core-9.3.0.jar:9.3.0 d25cebcef7a80369f4dfb9285ca7360a810b75dc - ivera - 2022-07-25 12:30:23]
at org.opensearch.index.store.Store$MetadataSnapshot.checksumFromLuceneFile(Store.java:1046) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.index.store.Store$MetadataSnapshot.loadMetadata(Store.java:1032) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.index.store.Store$MetadataSnapshot.<init>(Store.java:895) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.index.store.Store.getMetadata(Store.java:333) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.indices.replication.common.CopyState.<init>(CopyState.java:58) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.indices.replication.OngoingSegmentReplications.getCachedCopyState(OngoingSegmentReplications.java:83) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.indices.replication.OngoingSegmentReplications.prepareForReplication(OngoingSegmentReplications.java:158) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.indices.replication.SegmentReplicationSourceService$CheckpointInfoRequestHandler.messageReceived(SegmentReplicationSourceService.java:103) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.indices.replication.SegmentReplicationSourceService$CheckpointInfoRequestHandler.messageReceived(SegmentReplicationSourceService.java:86) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.transport.RequestHandlerRegistry.processMessageReceived(RequestHandlerRegistry.java:106) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.transport.InboundHandler$RequestHandler.doRun(InboundHandler.java:453) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:806) ~[opensearch-3.0.0.jar:3.0.0]
at org.opensearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:52) ~[opensearch-3.0.0.jar:3.0.0]
... 3 more