Phase 13: batch RS2 disk saves + force-claim ghost sessions after 15s
Two targeted fixes based on the 2026-04-22 06:26+ production log run.
(1) RS2 disk writes: one batched transaction instead of N sequential REPLACE INTOs
Every logout [perf-logout] line showed the same pattern:
core=72ms backpacks=6ms ss=5ms rs2=523ms total=606ms
core=56ms backpacks=4ms ss=1ms rs2=391ms total=452ms
core=77ms backpacks=3ms ss=1ms rs2=409ms total=490ms
RS2 dominated the save path. Backpacks + SS were already batched via
saveBackpackSnapshots since Phase 7, but saveRS2DisksByLevel still
looped saveStorageContents (one REPLACE INTO per disk).
Fix: collect every disk's NBT into Map<UUID, CompoundTag> first, then
delegate to saveBackpackSnapshots (same table, same batched transaction
path with per-entry fallback on failure). Expected ~10x reduction in
rs2= duration for players with 3-4 disks.
(2) Ghost-session force-claim: absolute 15s cap instead of stale-heartbeat-only
Fresh field logs showed the exact scenario Phase 10 left unsolved:
06:26:43 RESTORE started for 95d0db86
06:27:44 RESTORE completed in 60627ms (full poll timeout)
06:58:16 RESTORE started for 5d582bbc
06:59:17 RESTORE completed in 61630ms (full poll timeout)
The peer's heartbeat was always fresh (age 2-28s, well under the 60s
stale threshold), so Phase 11's 'only force-claim if stale' gate never
fired — the loop ran the full 120 attempts. Meanwhile [perf-logout]
proves real saves commit in < 1s, so a peer that hasn't flushed after
15s is a ghost session (player disconnected uncleanly, flag stuck at
online=1). Waiting another 45s for a save that isn't coming is pure
UX cost.
Fix: after join_peer_alive_max_wait_seconds (default raised from 5 to
15), force-claim unconditionally. Safe because:
- 15s is 15x the max observed save time — real saves are always
committed to DB by then.
- Phase 2's last_server guard already blocks any late write from
the ghost session (the guard logs [GUARD] on the peer's side).
- Phase 10 duplication scenario (force-claim before peer's async
save commits) can no longer happen with this safer threshold.
Peer-truly-stale short-circuit (heartbeat > 60s old) still triggers
instantly via the isPeerServerStale() check at the top of the loop —
only the 'peer alive but player ghost' path changed semantics.
This commit is contained in:
parent
61e6394efe
commit
fa7033fdea
|
|
@ -178,13 +178,12 @@ public class JdbcConfig {
|
|||
JOIN_PEER_ALIVE_MAX_WAIT_SECONDS = B.comment(
|
||||
"When the previous server is ALIVE (heartbeat fresh) but the player row still",
|
||||
"shows online=1 on it, how long to wait before force-claiming ownership on this",
|
||||
"server. Prevents the 30-60s 'empty inventory' window when a player active on",
|
||||
"peer A connects to peer B without cleanly logging out (proxy, network drop,",
|
||||
"dup session). After this timeout, peer A will simply fail to save this player",
|
||||
"(blocked by last_server guard) and their next disconnect won't overwrite B's",
|
||||
"data. Default 5s. Set to 0 to force-claim immediately; set high to restore the",
|
||||
"legacy behavior of waiting for the peer to flush.")
|
||||
.defineInRange("join_peer_alive_max_wait_seconds", 5, 0, 600);
|
||||
"server. Ghost sessions (network drop, proxy bypass, stuck flag) otherwise hold",
|
||||
"the join hostage up to 60s. Real logout saves consistently complete in <1s in",
|
||||
"production, so any wait > ~15s means the peer isn't going to flush — force-",
|
||||
"claim is safe because peer's future saves get blocked by the last_server guard.",
|
||||
"Default 15s. Set to 0 to force-claim immediately; set high to be more patient.")
|
||||
.defineInRange("join_peer_alive_max_wait_seconds", 15, 0, 600);
|
||||
POOL_STATS_INTERVAL_MINUTES = B.comment(
|
||||
"How often PoolStatsReporter logs executor + Hikari stats. 0 to disable.")
|
||||
.defineInRange("pool_stats_interval_minutes", 5, 0, 1440);
|
||||
|
|
|
|||
|
|
@ -457,32 +457,37 @@ public class VanillaSync {
|
|||
// atomic), just maybe slowly on a heavy-load server.
|
||||
long waitedMs = System.currentTimeMillis() - pollStartTime;
|
||||
if (PEER_ALIVE_MAX_WAIT_MS > 0 && waitedMs >= PEER_ALIVE_MAX_WAIT_MS) {
|
||||
// PHASE 11 FIX: compare peer heartbeat age against the stale
|
||||
// threshold (default 60s), NOT the 5s wait cap. A normal peer
|
||||
// with a 30s heartbeat interval naturally has an age of 0-30s
|
||||
// between ticks — using 5s as the "frozen" threshold caused
|
||||
// EVERY cross-server join to misclassify a healthy peer as
|
||||
// frozen and force-claim unnecessarily (observed in prod logs:
|
||||
// 'heartbeat frozen 5380ms').
|
||||
// PHASE 13: absolute wait cap as force-claim trigger.
|
||||
// Real logout saves complete in <1s in production (measured via
|
||||
// Phase 10 [perf-logout] breakdown: core+bp+ss+rs2 always 400-
|
||||
// 600ms). A wait of 15s+ with player still online=1 means the
|
||||
// peer is NOT going to flush — it's a ghost session (network
|
||||
// drop, proxy bypass, stuck flag). Keeping users hostage for the
|
||||
// full 60s poll was the reported 20s-60s join latency.
|
||||
//
|
||||
// Force-claim is safe here because:
|
||||
// 1. Phase 10 duplication risk was 'force-claim BEFORE peer's
|
||||
// async save commits' — but 15s is 15x the max observed
|
||||
// save time, so the peer has either committed or never
|
||||
// will.
|
||||
// 2. writeSnapshotToDB's last_server guard blocks any future
|
||||
// write from the peer for this player — their ghost
|
||||
// session's final save (if it ever comes) silently fails
|
||||
// and logs [GUARD] instead of overwriting our data.
|
||||
//
|
||||
// A separately-stale heartbeat still short-circuits immediately
|
||||
// (handled above by isPeerServerStale at the start of the loop).
|
||||
long peerAgeMs = peerHeartbeatAgeMs(otherServer);
|
||||
if (peerAgeMs > STALE_HEARTBEAT_MS) {
|
||||
SyncLogger.raceCondition(player_uuid,
|
||||
"Peer " + otherServer + " heartbeat stale " + peerAgeMs
|
||||
+ "ms > " + STALE_HEARTBEAT_MS + "ms, waited " + waitedMs + "ms — force-claiming");
|
||||
PlayerSync.LOGGER.warn(
|
||||
"Player {} waited {}ms for peer {} whose heartbeat is {}ms old (threshold {}ms) — force-claiming",
|
||||
player_uuid, waitedMs, otherServer, peerAgeMs, STALE_HEARTBEAT_MS);
|
||||
JDBCsetUp.executePreparedUpdate(
|
||||
"UPDATE " + Tables.playerData() + " SET online=0 WHERE uuid=? AND last_server=?",
|
||||
player_uuid, otherServer);
|
||||
break;
|
||||
}
|
||||
// Peer is actively heartbeating but slow to flush — keep waiting.
|
||||
// Warn sparingly to avoid log flood.
|
||||
if (attempt % 20 == 0) {
|
||||
SyncLogger.warnPlayer(player_uuid,
|
||||
"peer " + otherServer + " healthy but slow to flush (waited=" + waitedMs + "ms, hb_age=" + peerAgeMs + "ms)");
|
||||
}
|
||||
SyncLogger.raceCondition(player_uuid,
|
||||
"Peer " + otherServer + " ghost session suspected — waited "
|
||||
+ waitedMs + "ms (hb_age=" + peerAgeMs + "ms), force-claiming");
|
||||
PlayerSync.LOGGER.warn(
|
||||
"Player {} force-claiming from peer {} after {}ms wait (hb_age={}ms) — ghost session (peer won't flush)",
|
||||
player_uuid, otherServer, waitedMs, peerAgeMs);
|
||||
JDBCsetUp.executePreparedUpdate(
|
||||
"UPDATE " + Tables.playerData() + " SET online=0 WHERE uuid=? AND last_server=?",
|
||||
player_uuid, otherServer);
|
||||
break;
|
||||
}
|
||||
// PHASE 11: log RACE only every 10 attempts instead of every tick.
|
||||
// Previous behavior produced up to 120 lines per cross-server join,
|
||||
|
|
|
|||
|
|
@ -1156,13 +1156,21 @@ public class ModsSupport {
|
|||
|
||||
net.minecraft.nbt.CompoundTag fullNbt = sd.save(new net.minecraft.nbt.CompoundTag(), registryAccess);
|
||||
|
||||
// PHASE 13 PERF: collect all disk NBTs into a single Map and delegate to the
|
||||
// batched writer. Previous behavior made N sequential REPLACE INTO calls
|
||||
// (observed as rs2=500ms+ in [perf-logout] breakdowns). One batched transaction
|
||||
// now dominates by ~10× for players with multiple disks.
|
||||
Map<UUID, CompoundTag> toSave = new HashMap<>();
|
||||
for (UUID uuid : diskUuids) {
|
||||
net.minecraft.nbt.CompoundTag entryNbt = findRS2EntryInNbt(fullNbt, uuid.toString());
|
||||
if (entryNbt != null && !entryNbt.isEmpty()) {
|
||||
saveStorageContents(uuid, entryNbt);
|
||||
PlayerSync.LOGGER.info("Saved RS2 disk data for UUID {} (async save)", uuid);
|
||||
toSave.put(uuid, entryNbt);
|
||||
}
|
||||
}
|
||||
if (!toSave.isEmpty()) {
|
||||
saveBackpackSnapshots(toSave); // shared batched writer (backpack_data table)
|
||||
PlayerSync.LOGGER.info("Saved {} RS2 disk(s) in one batch", toSave.size());
|
||||
}
|
||||
} catch (Exception e) {
|
||||
PlayerSync.LOGGER.error("Error saving RS2 disks by level", e);
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user