From 84b2e60f00a842a284d0bef5545b765cb18af957 Mon Sep 17 00:00:00 2001 From: laforetbrut Date: Wed, 22 Apr 2026 09:51:56 +0200 Subject: [PATCH] Phase 14: fix 60s join wait caused by kick-check racing the poll MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The real root cause of 'inventory appears 30-60s after connect' — and it had nothing to do with ghost sessions or heartbeat thresholds. Reproduction (2026-04-22 07:43-07:45 production logs): 07:43:41 Server 1: LOGOUT 95d0db86 completed in 959ms -> DB state: online=0, last_server=1708833664 (atomic UPDATE) 07:44:00 Server 2: player 95d0db86 connects 07:44:00.x onPlayerLoggedInKickCheck executed -> executor.execute(UPDATE SET online=1 WHERE uuid=?) -> DB state: online=1, last_server=1708833664 <-- BUG: we wrote 1 07:44:00.y doPlayerJoin poll: SELECT online, last_server -> sees online=1, last_server=1708833664 -> 'Waiting for server 1708833664 to finish saving' for 60s 07:45:01 poll times out at 120/120, restore completes in 61219ms Server 2 was waiting for Server 1 to flush its save — but Server 1 had ALREADY flushed 19s earlier. Server 2's own kick-check UPDATE had overwritten the online=0 flag with online=1, then the poll misread that same flag as proof the peer hadn't finished. Fix: - onPlayerLoggedInKickCheck no longer writes online=1. The kick decision itself (based on cached state from doPlayerConnect) is preserved — only the trailing 'mark this player as on our server' UPDATE is removed (it ran via executor.execute and raced the poll). - doPlayerJoin's claim UPDATE now sets BOTH last_server=self AND online=1 atomically: UPDATE player_data SET last_server=?, online=1 WHERE uuid=? This is the single source of truth for 'player is now here'. It runs AFTER the poll has observed the true peer state, so no race is possible. Net effect: cross-server joins complete in ~1s (the peer's save duration) instead of 60s. Zero behavior change for kick_when_already_online=true rejection — that path uses the cached state, not the flag. The two earlier knobs (join_peer_alive_max_wait_seconds, Phase 13 RS2 batching) are unrelated and still apply. --- .../fubuki/playersync/sync/VanillaSync.java | 41 ++++++++----------- 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/src/main/java/vip/fubuki/playersync/sync/VanillaSync.java b/src/main/java/vip/fubuki/playersync/sync/VanillaSync.java index 3b4353d..fbf7ef7 100644 --- a/src/main/java/vip/fubuki/playersync/sync/VanillaSync.java +++ b/src/main/java/vip/fubuki/playersync/sync/VanillaSync.java @@ -504,11 +504,13 @@ public class VanillaSync { break; // Ready to load — other server finished or same server } - // NOW claim last_server for this server — AFTER the old server's save completed. - // This is safe because: (1) the old server's data+online=0 write already completed, - // (2) any future writes from the old server will be blocked by AND last_server=?. + // PHASE 14 FIX: claim ownership atomically — last_server=self AND online=1. + // Previously the kick check set online=1 upfront, racing this poll and causing + // the poll to see its own write as 'peer still online' (60s wait bug). Now the + // kick check leaves online alone, and this claim is the single source of truth + // for the new ownership state. JDBCsetUp.executePreparedUpdate( - "UPDATE " + Tables.playerData() + " SET last_server=? WHERE uuid=?", + "UPDATE " + Tables.playerData() + " SET last_server=?, online=1 WHERE uuid=?", JdbcConfig.SERVER_ID.get(), player_uuid); // === PHASE 1: DB reads on background thread (thread-safe) === @@ -763,15 +765,14 @@ public class VanillaSync { int[] cached = connectCheckCache.remove(player_uuid); if (!JdbcConfig.KICK_WHEN_ALREADY_ONLINE.get()) { - // FIX PERF (C1): online=1 is fire-and-forget; no login-critical decision depends - // on the write completing synchronously. Keeping this off the main thread saves - // one MySQL round-trip per join. - executorService.execute(() -> { - try { - JDBCsetUp.executePreparedUpdate( - "UPDATE " + Tables.playerData() + " SET online=1 WHERE uuid=?", player_uuid); - } catch (SQLException ignored) {} - }); + // PHASE 14 FIX: do NOT pre-mark online=1 here. Previously this UPDATE ran on + // the executor BEFORE doPlayerJoin's poll, overwriting a peer's freshly-committed + // online=0 — the poll would then see online=1 + last_server=OldPeer and wait the + // full 60s even though the peer had already flushed (observed in production logs + // 2026-04-22 07:43:41 -> 07:45:01, 60s of 'Waiting for server X to finish saving' + // when X had actually committed 19s earlier). + // doPlayerJoin now sets online=1 atomically with last_server=self as part of its + // claim UPDATE, after the poll has seen the true state. return; } @@ -823,17 +824,9 @@ public class VanillaSync { } } - // FIX PERF (C1): Mark online=1 asynchronously — no main-thread MySQL round-trip. - // The cache-based kick decision above is already final; this write only updates - // the persistent flag for cross-server detection, which tolerates a few ms of delay. - executorService.execute(() -> { - try { - JDBCsetUp.executePreparedUpdate( - "UPDATE " + Tables.playerData() + " SET online=1 WHERE uuid=?", player_uuid); - } catch (SQLException e) { - PlayerSync.LOGGER.error("Async online=1 update failed for {}", player_uuid, e); - } - }); + // PHASE 14 FIX: online=1 is no longer written here. See doPlayerJoin's claim + // UPDATE for the replacement — setting the flag earlier raced the poll and + // caused every cross-server join to wait the full 60s. } catch (Exception e) { PlayerSync.LOGGER.error("Error during kick check for player {}", player_uuid, e); }