-
Notifications
You must be signed in to change notification settings - Fork 3.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[ManagedLedger] Pin executor and scheduled executor threads for ManagedLedgerImpl #11387
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -57,6 +57,8 @@ | |
import java.util.concurrent.ConcurrentSkipListMap; | ||
import java.util.concurrent.CountDownLatch; | ||
import java.util.concurrent.ExecutionException; | ||
import java.util.concurrent.Executor; | ||
import java.util.concurrent.ScheduledExecutorService; | ||
import java.util.concurrent.ScheduledFuture; | ||
import java.util.concurrent.TimeUnit; | ||
import java.util.concurrent.TimeoutException; | ||
|
@@ -78,7 +80,6 @@ | |
import org.apache.bookkeeper.client.LedgerHandle; | ||
import org.apache.bookkeeper.client.api.ReadHandle; | ||
import org.apache.bookkeeper.common.util.Backoff; | ||
import org.apache.bookkeeper.common.util.OrderedExecutor; | ||
import org.apache.bookkeeper.common.util.OrderedScheduler; | ||
import org.apache.bookkeeper.common.util.Retries; | ||
import org.apache.bookkeeper.mledger.AsyncCallbacks; | ||
|
@@ -252,7 +253,8 @@ public enum PositionBound { | |
protected volatile State state = null; | ||
|
||
private final OrderedScheduler scheduledExecutor; | ||
private final OrderedExecutor executor; | ||
private final ScheduledExecutorService pinnedScheduledExecutor; | ||
private final Executor pinnedExecutor; | ||
final ManagedLedgerFactoryImpl factory; | ||
protected final ManagedLedgerMBeanImpl mbean; | ||
protected final Clock clock; | ||
|
@@ -298,7 +300,8 @@ public ManagedLedgerImpl(ManagedLedgerFactoryImpl factory, BookKeeper bookKeeper | |
this.ledgerMetadata = LedgerMetadataUtils.buildBaseManagedLedgerMetadata(name); | ||
this.digestType = BookKeeper.DigestType.fromApiDigestType(config.getDigestType()); | ||
this.scheduledExecutor = scheduledExecutor; | ||
this.executor = bookKeeper.getMainWorkerPool(); | ||
this.pinnedScheduledExecutor = scheduledExecutor.chooseThread(name); | ||
this.pinnedExecutor = bookKeeper.getMainWorkerPool().chooseThread(name); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't know why we never did this, but this saves a lot of string hashing too :) |
||
TOTAL_SIZE_UPDATER.set(this, 0); | ||
NUMBER_OF_ENTRIES_UPDATER.set(this, 0); | ||
ENTRIES_ADDED_COUNTER_UPDATER.set(this, 0); | ||
|
@@ -353,7 +356,7 @@ public void operationComplete(ManagedLedgerInfo mlInfo, Stat stat) { | |
if (ledgers.size() > 0) { | ||
final long id = ledgers.lastKey(); | ||
OpenCallback opencb = (rc, lh, ctx1) -> { | ||
executor.executeOrdered(name, safeRun(() -> { | ||
pinnedExecutor.execute(safeRun(() -> { | ||
mbean.endDataLedgerOpenOp(); | ||
if (log.isDebugEnabled()) { | ||
log.debug("[{}] Opened ledger {}: {}", name, id, BKException.getMessage(rc)); | ||
|
@@ -462,7 +465,7 @@ public void operationFailed(MetaStoreException e) { | |
return; | ||
} | ||
|
||
executor.executeOrdered(name, safeRun(() -> { | ||
pinnedExecutor.execute(safeRun(() -> { | ||
mbean.endDataLedgerCreateOp(); | ||
if (rc != BKException.Code.OK) { | ||
callback.initializeFailed(createManagedLedgerException(rc)); | ||
|
@@ -701,7 +704,7 @@ public void asyncAddEntry(ByteBuf buffer, AddEntryCallback callback, Object ctx) | |
OpAddEntry addOperation = OpAddEntry.create(this, buffer, callback, ctx); | ||
|
||
// Jump to specific thread to avoid contention from writers writing from different threads | ||
executor.executeOrdered(name, safeRun(() -> internalAsyncAddEntry(addOperation))); | ||
pinnedExecutor.execute(safeRun(() -> internalAsyncAddEntry(addOperation))); | ||
} | ||
|
||
@Override | ||
|
@@ -713,7 +716,7 @@ public void asyncAddEntry(ByteBuf buffer, int numberOfMessages, AddEntryCallback | |
OpAddEntry addOperation = OpAddEntry.create(this, buffer, numberOfMessages, callback, ctx); | ||
|
||
// Jump to specific thread to avoid contention from writers writing from different threads | ||
executor.executeOrdered(name, safeRun(() -> internalAsyncAddEntry(addOperation))); | ||
pinnedExecutor.execute(safeRun(() -> internalAsyncAddEntry(addOperation))); | ||
} | ||
|
||
private synchronized void internalAsyncAddEntry(OpAddEntry addOperation) { | ||
|
@@ -1481,7 +1484,7 @@ public void operationFailed(MetaStoreException e) { | |
private void updateLedgersListAfterRollover(MetaStoreCallback<Void> callback) { | ||
if (!metadataMutex.tryLock()) { | ||
// Defer update for later | ||
scheduledExecutor.schedule(() -> updateLedgersListAfterRollover(callback), 100, TimeUnit.MILLISECONDS); | ||
pinnedScheduledExecutor.schedule(() -> updateLedgersListAfterRollover(callback), 100, TimeUnit.MILLISECONDS); | ||
return; | ||
} | ||
|
||
|
@@ -1778,7 +1781,7 @@ CompletableFuture<ReadHandle> getLedgerHandle(long ledgerId) { | |
} | ||
promise.complete(res); | ||
} | ||
}, executor.chooseThread(name)); | ||
}, pinnedExecutor); | ||
return promise; | ||
}); | ||
} | ||
|
@@ -2159,7 +2162,7 @@ void notifyCursors() { | |
break; | ||
} | ||
|
||
executor.execute(safeRun(waitingCursor::notifyEntriesAvailable)); | ||
pinnedExecutor.execute(safeRun(waitingCursor::notifyEntriesAvailable)); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this required to be on the same executor? We're notifying multiple cursors that entries are available; this should be able to progress in parallel. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There are 2 places that will call this. Another one is when the ledger is closed; it looks like we only need to change it here. |
||
} | ||
} | ||
|
||
|
@@ -2170,7 +2173,7 @@ void notifyWaitingEntryCallBacks() { | |
break; | ||
} | ||
|
||
executor.execute(safeRun(cb::entriesAvailable)); | ||
pinnedExecutor.execute(safeRun(cb::entriesAvailable)); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same for this one; it should be safe to spread into multiple threads. |
||
} | ||
} | ||
|
||
|
@@ -2217,15 +2220,16 @@ private void trimConsumedLedgersInBackground() { | |
|
||
@Override | ||
public void trimConsumedLedgersInBackground(CompletableFuture<?> promise) { | ||
executor.executeOrdered(name, safeRun(() -> internalTrimConsumedLedgers(promise))); | ||
pinnedExecutor.execute(safeRun(() -> internalTrimConsumedLedgers(promise))); | ||
} | ||
|
||
public void trimConsumedLedgersInBackground(boolean isTruncate, CompletableFuture<?> promise) { | ||
executor.executeOrdered(name, safeRun(() -> internalTrimLedgers(isTruncate, promise))); | ||
pinnedExecutor.execute(safeRun(() -> internalTrimLedgers(isTruncate, promise))); | ||
} | ||
|
||
private void scheduleDeferredTrimming(boolean isTruncate, CompletableFuture<?> promise) { | ||
scheduledExecutor.schedule(safeRun(() -> trimConsumedLedgersInBackground(isTruncate, promise)), 100, TimeUnit.MILLISECONDS); | ||
pinnedScheduledExecutor | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since |
||
.schedule(safeRun(() -> trimConsumedLedgersInBackground(isTruncate, promise)), 100, TimeUnit.MILLISECONDS); | ||
} | ||
|
||
private void maybeOffloadInBackground(CompletableFuture<PositionImpl> promise) { | ||
|
@@ -2234,13 +2238,13 @@ private void maybeOffloadInBackground(CompletableFuture<PositionImpl> promise) { | |
&& config.getLedgerOffloader().getOffloadPolicies() != null | ||
&& config.getLedgerOffloader().getOffloadPolicies().getManagedLedgerOffloadThresholdInBytes() != null | ||
&& config.getLedgerOffloader().getOffloadPolicies().getManagedLedgerOffloadThresholdInBytes() >= 0) { | ||
executor.executeOrdered(name, safeRun(() -> maybeOffload(promise))); | ||
pinnedExecutor.execute(safeRun(() -> maybeOffload(promise))); | ||
} | ||
} | ||
|
||
private void maybeOffload(CompletableFuture<PositionImpl> finalPromise) { | ||
if (!offloadMutex.tryLock()) { | ||
scheduledExecutor.schedule(safeRun(() -> maybeOffloadInBackground(finalPromise)), | ||
pinnedScheduledExecutor.schedule(safeRun(() -> maybeOffloadInBackground(finalPromise)), | ||
100, TimeUnit.MILLISECONDS); | ||
} else { | ||
CompletableFuture<PositionImpl> unlockingPromise = new CompletableFuture<>(); | ||
|
@@ -2660,7 +2664,7 @@ private void asyncDeleteLedger(long ledgerId, long retry) { | |
log.warn("[{}] Ledger was already deleted {}", name, ledgerId); | ||
} else if (rc != BKException.Code.OK) { | ||
log.error("[{}] Error deleting ledger {} : {}", name, ledgerId, BKException.getMessage(rc)); | ||
scheduledExecutor.schedule(safeRun(() -> asyncDeleteLedger(ledgerId, retry - 1)), DEFAULT_LEDGER_DELETE_BACKOFF_TIME_SEC, TimeUnit.SECONDS); | ||
pinnedScheduledExecutor.schedule(safeRun(() -> asyncDeleteLedger(ledgerId, retry - 1)), DEFAULT_LEDGER_DELETE_BACKOFF_TIME_SEC, TimeUnit.SECONDS); | ||
} else { | ||
if (log.isDebugEnabled()) { | ||
log.debug("[{}] Deleted ledger {}", name, ledgerId); | ||
|
@@ -2932,7 +2936,7 @@ private void tryTransformLedgerInfo(long ledgerId, LedgerInfoTransformation tran | |
synchronized (this) { | ||
if (!metadataMutex.tryLock()) { | ||
// retry in 100 milliseconds | ||
scheduledExecutor.schedule( | ||
pinnedScheduledExecutor.schedule( | ||
safeRun(() -> tryTransformLedgerInfo(ledgerId, transformation, finalPromise)), 100, | ||
TimeUnit.MILLISECONDS); | ||
} else { // lock acquired | ||
|
@@ -3412,12 +3416,12 @@ public NavigableMap<Long, LedgerInfo> getLedgersInfo() { | |
return ledgers; | ||
} | ||
|
||
OrderedScheduler getScheduledExecutor() { | ||
return scheduledExecutor; | ||
ScheduledExecutorService getPinnedScheduledExecutor() { | ||
return pinnedScheduledExecutor; | ||
} | ||
|
||
OrderedExecutor getExecutor() { | ||
return executor; | ||
Executor getPinnedExecutor() { | ||
return pinnedExecutor; | ||
} | ||
|
||
private ManagedLedgerInfo getManagedLedgerInfo() { | ||
|
@@ -3618,7 +3622,7 @@ protected void asyncCreateLedger(BookKeeper bookKeeper, ManagedLedgerConfig conf | |
cb.createComplete(Code.UnexpectedConditionException, null, ledgerCreated); | ||
return; | ||
} | ||
scheduledExecutor.schedule(() -> { | ||
pinnedScheduledExecutor.schedule(() -> { | ||
if (!ledgerCreated.get()) { | ||
if (log.isDebugEnabled()) { | ||
log.debug("[{}] Timeout creating ledger", name); | ||
|
@@ -3667,7 +3671,7 @@ private void scheduleTimeoutTask() { | |
timeoutSec = timeoutSec <= 0 | ||
? Math.max(config.getAddEntryTimeoutSeconds(), config.getReadEntryTimeoutSeconds()) | ||
: timeoutSec; | ||
this.timeoutTask = this.scheduledExecutor.scheduleAtFixedRate(safeRun(() -> { | ||
this.timeoutTask = this.pinnedScheduledExecutor.scheduleAtFixedRate(safeRun(() -> { | ||
checkAddTimeout(); | ||
checkReadTimeout(); | ||
}), timeoutSec, timeoutSec, TimeUnit.SECONDS); | ||
|
@@ -3808,7 +3812,7 @@ private void asyncUpdateProperties(Map<String, String> properties, boolean isDel | |
String deleteKey, final UpdatePropertiesCallback callback, Object ctx) { | ||
if (!metadataMutex.tryLock()) { | ||
// Defer update for later | ||
scheduledExecutor.schedule(() -> asyncUpdateProperties(properties, isDelete, deleteKey, | ||
pinnedScheduledExecutor.schedule(() -> asyncUpdateProperties(properties, isDelete, deleteKey, | ||
callback, ctx), 100, TimeUnit.MILLISECONDS); | ||
return; | ||
} | ||
|
@@ -3955,7 +3959,7 @@ private void updateLastLedgerCreatedTimeAndScheduleRolloverTask() { | |
// and the previous checkLedgerRollTask is not done, we could cancel it | ||
checkLedgerRollTask.cancel(true); | ||
} | ||
this.checkLedgerRollTask = this.scheduledExecutor.schedule( | ||
this.checkLedgerRollTask = this.pinnedScheduledExecutor.schedule( | ||
safeRun(this::rollCurrentLedgerIfFull), this.maximumRolloverTimeMs, TimeUnit.MILLISECONDS); | ||
} | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -93,7 +93,7 @@ public void readEntriesFailed(ManagedLedgerException exception, Object ctx) { | |
|
||
if (!entries.isEmpty()) { | ||
// There were already some entries that were read before, we can return them | ||
cursor.ledger.getExecutor().execute(safeRun(() -> { | ||
cursor.ledger.getPinnedExecutor().execute(safeRun(() -> { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we should be careful not to serialize every cursor into the managed ledger pinned thread, as it could become a bottleneck when there are many cursors on a topic. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, that's true. The reason to use the pinned executor is to adhere to the Java Memory Model rules of correct synchronization. There's a generic problem in OpReadEntry since it's sharing an array that is mutated by multiple threads. JLS 17.4 explains that "Incorrectly Synchronized Programs May Exhibit Surprising Behavior". I would assume that "entries" would have to be copied to a new list before sharing if we want to use multiple threads. Is that right? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think the entry reading happens one by one; if we get a read-entries failure here, it means we will not get a chance to add more elements to the list, right (all the previous read operations are done)? |
||
callback.readEntriesComplete(entries, ctx); | ||
recycle(); | ||
})); | ||
|
@@ -141,8 +141,8 @@ void checkReadCompletion() { | |
cursor.ledger.startReadOperationOnLedger(nextReadPosition, OpReadEntry.this); | ||
} | ||
|
||
// Schedule next read in a different thread | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a behaviour change There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
testing, testing, testing. we need more Fallout tests. :) |
||
cursor.ledger.getExecutor().execute(safeRun(() -> { | ||
// Schedule next read | ||
cursor.ledger.getPinnedExecutor().execute(safeRun(() -> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Other than the consideration that different cursors shouldn't be pinned on a single thread, the reason for jumping to a different thread here is to avoid a stack overflow. When the read is being served from the ML cache, it's coming back from same thread. There are some conditions in which we ask for next read. eg. If you ask to read 100 entries and we only got 20 entries from current ledger, we'll schedule a read for the remaining 80 on next ledger. In some cases there could be abnormal distributions, like 1 entry per ledger and it would be chaining all the reads and callback within the same stack. Therefore, the "jump to a random thread" was introduced to break that chain. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wouldn't the usage of the pinned executor achieve the same result? It prevents the stack from going deeper and deeper. The only reason that comes into my mind is the case where there's a completable future that gets triggered as part of the call flow and it is being waited to complete in the same thread as where the result should be executed in. That would never complete and would dead lock. Would that be the case here to use a different executor? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It looks like the stack should be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
@lhotari Uhm, I think that some executors short-circuit the queue if they detect that you're trying to add a task from the same executor thread. That is the case for the Netty IO thread, though I just checked that it shouldn't happen on the |
||
readPosition = cursor.ledger.startReadOperationOnLedger(nextReadPosition, OpReadEntry.this); | ||
cursor.ledger.asyncReadEntries(OpReadEntry.this); | ||
})); | ||
|
@@ -152,7 +152,7 @@ void checkReadCompletion() { | |
cursor.readOperationCompleted(); | ||
|
||
} finally { | ||
cursor.ledger.getExecutor().executeOrdered(cursor.ledger.getName(), safeRun(() -> { | ||
cursor.ledger.getPinnedExecutor().execute(safeRun(() -> { | ||
callback.readEntriesComplete(entries, ctx); | ||
recycle(); | ||
})); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we are using 2 threads, one with the regular executor (which is more efficient) and the other for the
pinnedScheduledExecutor
, wouldn't that mean that we still have more than 1 thread accessing some of the objects? Would it make sense to use the generic
scheduledExecutor
(just for deferring purposes) and then jump back into the same pinnedExecutor
? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's true.
Perhaps a more optimal solution would be to have the capability for scheduling tasks on the pinned scheduler. I don't know why this solution isn't available in the underlying Bookkeeper libraries that are used. The benefit of that is that there isn't an additional thread switch when the scheduled task triggers.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@merlimat Do you mean scheduledExecutor.schedule(pinnedExecutor.execute() ...) ?
Seems to be a feasible way right now :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@lhotari The scheduled executor is less efficient compared to the normal executor because it has to maintain the delayed tasks. For that it's preferable not to use it directly in the critical data path, but only when we want to defer actions or for background tasks.