From fa7033fdea214d1cb2a0e81036be38aea5460a4e Mon Sep 17 00:00:00 2001 From: laforetbrut Date: Wed, 22 Apr 2026 09:04:53 +0200 Subject: [PATCH] Phase 13: batch RS2 disk saves + force-claim ghost sessions after 15s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two targeted fixes based on the 2026-04-22 06:26+ production log run. (1) RS2 disk writes: one batched transaction instead of N sequential REPLACE INTOs Every logout [perf-logout] line showed the same pattern: core=72ms backpacks=6ms ss=5ms rs2=523ms total=606ms core=56ms backpacks=4ms ss=1ms rs2=391ms total=452ms core=77ms backpacks=3ms ss=1ms rs2=409ms total=490ms RS2 dominated the save path. Backpacks + SS were already batched via saveBackpackSnapshots since Phase 7, but saveRS2DisksByLevel still looped saveStorageContents (one REPLACE INTO per disk). Fix: collect every disk's NBT into Map first, then delegate to saveBackpackSnapshots (same table, same batched transaction path with per-entry fallback on failure). Expected ~10x reduction in rs2= duration for players with 3-4 disks. (2) Ghost-session force-claim: absolute 15s cap instead of stale-heartbeat-only Fresh field logs showed the exact scenario Phase 10 left unsolved: 06:26:43 RESTORE started for 95d0db86 06:27:44 RESTORE completed in 60627ms (full poll timeout) 06:58:16 RESTORE started for 5d582bbc 06:59:17 RESTORE completed in 61630ms (full poll timeout) The peer's heartbeat was always fresh (age 2-28s, well under the 60s stale threshold), so Phase 11's 'only force-claim if stale' gate never fired — the loop ran the full 120 attempts. Meanwhile [perf-logout] proves real saves commit in < 1s, so a peer that hasn't flushed after 15s is a ghost session (player disconnected uncleanly, flag stuck at online=1). Waiting another 45s for a save that isn't coming is pure UX cost. Fix: after join_peer_alive_max_wait_seconds (default raised from 5 to 15), force-claim unconditionally. Safe because: - 15s is 15x the max observed save time — real saves are always committed to DB by then. - Phase 2's last_server guard already blocks any late write from the ghost session (the guard logs [GUARD] on the peer's side). - Phase 10 duplication scenario (force-claim before peer's async save commits) can no longer happen with this safer threshold. Peer-truly-stale short-circuit (heartbeat > 60s old) still triggers instantly via the isPeerServerStale() check at the top of the loop — only the 'peer alive but player ghost' path changed semantics. --- .../fubuki/playersync/config/JdbcConfig.java | 13 ++--- .../fubuki/playersync/sync/VanillaSync.java | 55 ++++++++++--------- .../playersync/sync/addons/ModsSupport.java | 12 +++- 3 files changed, 46 insertions(+), 34 deletions(-) diff --git a/src/main/java/vip/fubuki/playersync/config/JdbcConfig.java b/src/main/java/vip/fubuki/playersync/config/JdbcConfig.java index f06e0fa..76c0564 100644 --- a/src/main/java/vip/fubuki/playersync/config/JdbcConfig.java +++ b/src/main/java/vip/fubuki/playersync/config/JdbcConfig.java @@ -178,13 +178,12 @@ public class JdbcConfig { JOIN_PEER_ALIVE_MAX_WAIT_SECONDS = B.comment( "When the previous server is ALIVE (heartbeat fresh) but the player row still", "shows online=1 on it, how long to wait before force-claiming ownership on this", - "server. Prevents the 30-60s 'empty inventory' window when a player active on", - "peer A connects to peer B without cleanly logging out (proxy, network drop,", - "dup session). After this timeout, peer A will simply fail to save this player", - "(blocked by last_server guard) and their next disconnect won't overwrite B's", - "data. Default 5s. Set to 0 to force-claim immediately; set high to restore the", - "legacy behavior of waiting for the peer to flush.") - .defineInRange("join_peer_alive_max_wait_seconds", 5, 0, 600); + "server. Ghost sessions (network drop, proxy bypass, stuck flag) otherwise hold", + "the join hostage up to 60s. Real logout saves consistently complete in <1s in", + "production, so any wait > ~15s means the peer isn't going to flush — force-", + "claim is safe because peer's future saves get blocked by the last_server guard.", + "Default 15s. Set to 0 to force-claim immediately; set high to be more patient.") + .defineInRange("join_peer_alive_max_wait_seconds", 15, 0, 600); POOL_STATS_INTERVAL_MINUTES = B.comment( "How often PoolStatsReporter logs executor + Hikari stats. 0 to disable.") .defineInRange("pool_stats_interval_minutes", 5, 0, 1440); diff --git a/src/main/java/vip/fubuki/playersync/sync/VanillaSync.java b/src/main/java/vip/fubuki/playersync/sync/VanillaSync.java index 194b938..3b4353d 100644 --- a/src/main/java/vip/fubuki/playersync/sync/VanillaSync.java +++ b/src/main/java/vip/fubuki/playersync/sync/VanillaSync.java @@ -457,32 +457,37 @@ public class VanillaSync { // atomic), just maybe slowly on a heavy-load server. long waitedMs = System.currentTimeMillis() - pollStartTime; if (PEER_ALIVE_MAX_WAIT_MS > 0 && waitedMs >= PEER_ALIVE_MAX_WAIT_MS) { - // PHASE 11 FIX: compare peer heartbeat age against the stale - // threshold (default 60s), NOT the 5s wait cap. A normal peer - // with a 30s heartbeat interval naturally has an age of 0-30s - // between ticks — using 5s as the "frozen" threshold caused - // EVERY cross-server join to misclassify a healthy peer as - // frozen and force-claim unnecessarily (observed in prod logs: - // 'heartbeat frozen 5380ms'). + // PHASE 13: absolute wait cap as force-claim trigger. + // Real logout saves complete in <1s in production (measured via + // Phase 10 [perf-logout] breakdown: core+bp+ss+rs2 always 400- + // 600ms). A wait of 15s+ with player still online=1 means the + // peer is NOT going to flush — it's a ghost session (network + // drop, proxy bypass, stuck flag). Keeping users hostage for the + // full 60s poll was the reported 20s-60s join latency. + // + // Force-claim is safe here because: + // 1. Phase 10 duplication risk was 'force-claim BEFORE peer's + // async save commits' — but 15s is 15x the max observed + // save time, so the peer has either committed or never + // will. + // 2. writeSnapshotToDB's last_server guard blocks any future + // write from the peer for this player — their ghost + // session's final save (if it ever comes) silently fails + // and logs [GUARD] instead of overwriting our data. + // + // A separately-stale heartbeat still short-circuits immediately + // (handled above by isPeerServerStale at the start of the loop). long peerAgeMs = peerHeartbeatAgeMs(otherServer); - if (peerAgeMs > STALE_HEARTBEAT_MS) { - SyncLogger.raceCondition(player_uuid, - "Peer " + otherServer + " heartbeat stale " + peerAgeMs - + "ms > " + STALE_HEARTBEAT_MS + "ms, waited " + waitedMs + "ms — force-claiming"); - PlayerSync.LOGGER.warn( - "Player {} waited {}ms for peer {} whose heartbeat is {}ms old (threshold {}ms) — force-claiming", - player_uuid, waitedMs, otherServer, peerAgeMs, STALE_HEARTBEAT_MS); - JDBCsetUp.executePreparedUpdate( - "UPDATE " + Tables.playerData() + " SET online=0 WHERE uuid=? AND last_server=?", - player_uuid, otherServer); - break; - } - // Peer is actively heartbeating but slow to flush — keep waiting. - // Warn sparingly to avoid log flood. - if (attempt % 20 == 0) { - SyncLogger.warnPlayer(player_uuid, - "peer " + otherServer + " healthy but slow to flush (waited=" + waitedMs + "ms, hb_age=" + peerAgeMs + "ms)"); - } + SyncLogger.raceCondition(player_uuid, + "Peer " + otherServer + " ghost session suspected — waited " + + waitedMs + "ms (hb_age=" + peerAgeMs + "ms), force-claiming"); + PlayerSync.LOGGER.warn( + "Player {} force-claiming from peer {} after {}ms wait (hb_age={}ms) — ghost session (peer won't flush)", + player_uuid, otherServer, waitedMs, peerAgeMs); + JDBCsetUp.executePreparedUpdate( + "UPDATE " + Tables.playerData() + " SET online=0 WHERE uuid=? AND last_server=?", + player_uuid, otherServer); + break; } // PHASE 11: log RACE only every 10 attempts instead of every tick. // Previous behavior produced up to 120 lines per cross-server join, diff --git a/src/main/java/vip/fubuki/playersync/sync/addons/ModsSupport.java b/src/main/java/vip/fubuki/playersync/sync/addons/ModsSupport.java index ed31f2f..0bceae2 100644 --- a/src/main/java/vip/fubuki/playersync/sync/addons/ModsSupport.java +++ b/src/main/java/vip/fubuki/playersync/sync/addons/ModsSupport.java @@ -1156,13 +1156,21 @@ public class ModsSupport { net.minecraft.nbt.CompoundTag fullNbt = sd.save(new net.minecraft.nbt.CompoundTag(), registryAccess); + // PHASE 13 PERF: collect all disk NBTs into a single Map and delegate to the + // batched writer. Previous behavior made N sequential REPLACE INTO calls + // (observed as rs2=500ms+ in [perf-logout] breakdowns). One batched transaction + // now dominates by ~10× for players with multiple disks. + Map toSave = new HashMap<>(); for (UUID uuid : diskUuids) { net.minecraft.nbt.CompoundTag entryNbt = findRS2EntryInNbt(fullNbt, uuid.toString()); if (entryNbt != null && !entryNbt.isEmpty()) { - saveStorageContents(uuid, entryNbt); - PlayerSync.LOGGER.info("Saved RS2 disk data for UUID {} (async save)", uuid); + toSave.put(uuid, entryNbt); } } + if (!toSave.isEmpty()) { + saveBackpackSnapshots(toSave); // shared batched writer (backpack_data table) + PlayerSync.LOGGER.info("Saved {} RS2 disk(s) in one batch", toSave.size()); + } } catch (Exception e) { PlayerSync.LOGGER.error("Error saving RS2 disks by level", e); }