Phase 9: cap wait time on alive-peer ghost sessions (fixes 30-60s join delay)

Reproduction (from production logs, 2026-04-22):
  02:54:13 - 02:54:44  player 724b9ff8 waits 30s for server 1708833664 (60 attempts)
  02:54:31 - 02:55:02  player 46284b41 waits 30s for server 0 (zombie)
  05:10:53 - 05:11:55  player 95d0db86 waits 62s for server 1708833664 (120 attempts)
  05:10:59 - 05:12:01  player 724b9ff8 waits 62s for server 1708833664 (120 attempts)

User report: 'un joueur se connecte et son inventaire s'affiche 30 secondes
après sa connexion'.

Root cause: doPlayerJoin's last_server poll waits for the previous server to
clear online=0. If the peer is alive (heartbeat fresh) but the player is
ghost-online there (proxy bypass, network drop, or actively playing on the
other server without clean logout), the peer NEVER flushes → we wait the
full join_poll_max_attempts * join_poll_interval_ms (60s default) for
nothing. Meanwhile the player sees an empty inventory on this server.

The zombie-peer short-circuit already handled dead peers. This commit adds
the complementary case: ALIVE peers with a stuck session.

Fix:
  - New config key join_peer_alive_max_wait_seconds (default 5, range 0-600).
  - When the peer's heartbeat is fresh but player.online is still 1,
    wait at most this many seconds, then force-claim ownership by setting
    online=0 AND last_server=self.
  - The peer will be prevented from overwriting us: writeSnapshotToDB
    already has the last_server guard (added in Phase 2) which blocks any
    future save the peer issues for this player — they see a GUARD log
    and skip downstream backpack/SS/RS2 writes.
  - Default 5s is a reasonable trade-off: legitimate slow flushes complete
    within that window, ghost sessions don't block the player 60s+.
  - Set to 0 to force-claim immediately (most aggressive, best for proxies).
  - Set high to restore the legacy behavior (wait full poll length).

Also removed the per-tick 'Player X still being saved...' LOGGER.info line
that was spamming the Minecraft server log every 500ms during a ghost wait
— the SyncLogger.raceCondition entry already captures the same information
in the dedicated sync.log and avoids polluting server.log with 120+ lines
per join.
This commit is contained in:
laforetbrut 2026-04-22 07:16:47 +02:00
parent 131aa64eb1
commit b670794d9a
2 changed files with 38 additions and 5 deletions

View File

@ -57,6 +57,7 @@ public class JdbcConfig {
public static ModConfigSpec.IntValue PEER_STALE_THRESHOLD_SECONDS;
public static ModConfigSpec.IntValue JOIN_POLL_MAX_ATTEMPTS;
public static ModConfigSpec.IntValue JOIN_POLL_INTERVAL_MS;
public static ModConfigSpec.IntValue JOIN_PEER_ALIVE_MAX_WAIT_SECONDS;
public static ModConfigSpec.IntValue POOL_STATS_INTERVAL_MINUTES;
public static ModConfigSpec.IntValue HIKARI_POOL_MAX_SIZE;
public static ModConfigSpec.IntValue HIKARI_LEAK_THRESHOLD_MS;
@ -174,6 +175,16 @@ public class JdbcConfig {
JOIN_POLL_INTERVAL_MS = B.comment(
"Wait interval between last_server poll attempts (milliseconds).")
.defineInRange("join_poll_interval_ms", 500, 100, 5000);
JOIN_PEER_ALIVE_MAX_WAIT_SECONDS = B.comment(
"When the previous server is ALIVE (heartbeat fresh) but the player row still",
"shows online=1 on it, how long to wait before force-claiming ownership on this",
"server. Prevents the 30-60s 'empty inventory' window when a player active on",
"peer A connects to peer B without cleanly logging out (proxy, network drop,",
"dup session). After this timeout, peer A will simply fail to save this player",
"(blocked by last_server guard) and their next disconnect won't overwrite B's",
"data. Default 5s. Set to 0 to force-claim immediately; set high to restore the",
"legacy behavior of waiting for the peer to flush.")
.defineInRange("join_peer_alive_max_wait_seconds", 5, 0, 600);
POOL_STATS_INTERVAL_MINUTES = B.comment(
"How often PoolStatsReporter logs executor + Hikari stats. 0 to disable.")
.defineInRange("pool_stats_interval_minutes", 5, 0, 1440);

View File

@ -390,6 +390,14 @@ public class VanillaSync {
final int MAX_POLL = JdbcConfig.JOIN_POLL_MAX_ATTEMPTS.get();
final int POLL_INTERVAL_MS = JdbcConfig.JOIN_POLL_INTERVAL_MS.get();
final long STALE_HEARTBEAT_MS = JdbcConfig.PEER_STALE_THRESHOLD_SECONDS.get() * 1000L;
// PHASE 9: when the peer is alive (heartbeat fresh) but the player row still shows
// online=1 on it typical of a ghost session (proxy, network drop, or the user
// walking between servers without clean logout) waiting the full 60s is useless:
// the peer will never flush because the session is technically active there. We
// cap the wait at this shorter window, then force-claim and rely on the last_server
// guard in writeSnapshotToDB to prevent the peer from overwriting us later.
final long PEER_ALIVE_MAX_WAIT_MS = JdbcConfig.JOIN_PEER_ALIVE_MAX_WAIT_SECONDS.get() * 1000L;
final long pollStartTime = System.currentTimeMillis();
for (int attempt = 0; attempt < MAX_POLL; attempt++) {
try (JDBCsetUp.QueryResult qrCheck = JDBCsetUp.executePreparedQuery(
"SELECT online, last_server FROM " + Tables.playerData() + " WHERE uuid=?", player_uuid)) {
@ -402,20 +410,34 @@ public class VanillaSync {
// FIX P1-3: zombie-server short-circuit. server_id=0 is never
// a legitimate server (SERVER_ID config generates nextInt(1, MAX-1)).
// Absent or stale (>60s) heartbeat => treat as dead, take over.
if (otherServer == 0 || isPeerServerStale(otherServer, STALE_HEARTBEAT_MS)) {
boolean peerDead = (otherServer == 0 || isPeerServerStale(otherServer, STALE_HEARTBEAT_MS));
if (peerDead) {
SyncLogger.raceCondition(player_uuid,
"Peer server " + otherServer + " is dead/zombie — taking over after " + attempt + " attempts");
PlayerSync.LOGGER.warn("Player {} last_server={} is dead/zombie, bypassing wait",
player_uuid, otherServer);
// Force-clear its online flag so subsequent logic proceeds cleanly.
JDBCsetUp.executePreparedUpdate(
"UPDATE " + Tables.playerData() + " SET online=0 WHERE uuid=? AND last_server=?",
player_uuid, otherServer);
break;
}
SyncLogger.raceCondition(player_uuid, "Waiting for server " + otherServer + " to finish saving (attempt " + (attempt + 1) + "/" + MAX_POLL + ")");
PlayerSync.LOGGER.info("Player {} still being saved on server {} (attempt {}/{}), waiting 500ms...",
player_uuid, otherServer, attempt + 1, MAX_POLL);
// PHASE 9: peer ALIVE but session hasn't flushed enforce short cap.
long waitedMs = System.currentTimeMillis() - pollStartTime;
if (waitedMs >= PEER_ALIVE_MAX_WAIT_MS) {
SyncLogger.raceCondition(player_uuid,
"Peer server " + otherServer + " is alive but player ghost-online for "
+ waitedMs + "ms — force-claiming ownership");
PlayerSync.LOGGER.warn(
"Player {} still online=1 on alive peer server {} after {}ms — force-takeover on this server. Peer's future saves will be blocked by the last_server guard.",
player_uuid, otherServer, waitedMs);
// Force-clear the flag and claim writeSnapshotToDB will guard
// the peer's next write so they can't overwrite us.
JDBCsetUp.executePreparedUpdate(
"UPDATE " + Tables.playerData() + " SET online=0 WHERE uuid=? AND last_server=?",
player_uuid, otherServer);
break;
}
SyncLogger.raceCondition(player_uuid, "Waiting for server " + otherServer + " to finish saving (attempt " + (attempt + 1) + "/" + MAX_POLL + ", waited=" + waitedMs + "ms)");
Thread.sleep(POLL_INTERVAL_MS);
continue;
}