From fa7033fdea214d1cb2a0e81036be38aea5460a4e Mon Sep 17 00:00:00 2001
From: laforetbrut <vyrriox@gmail.com>
Date: Wed, 22 Apr 2026 09:04:53 +0200
Subject: [PATCH] Phase 13: batch RS2 disk saves + force-claim ghost sessions
 after 15s
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two targeted fixes based on the 2026-04-22 06:26+ production log run.

(1) RS2 disk writes: one batched transaction instead of N sequential REPLACE INTOs
    Every logout [perf-logout] line showed the same pattern:
        core=72ms  backpacks=6ms  ss=5ms  rs2=523ms  total=606ms
        core=56ms  backpacks=4ms  ss=1ms  rs2=391ms  total=452ms
        core=77ms  backpacks=3ms  ss=1ms  rs2=409ms  total=490ms
    RS2 dominated the save path. Backpacks + SS were already batched via
    saveBackpackSnapshots since Phase 7, but saveRS2DisksByLevel still
    looped saveStorageContents (one REPLACE INTO per disk).

    Fix: collect every disk's NBT into Map<UUID, CompoundTag> first, then
    delegate to saveBackpackSnapshots (same table, same batched transaction
    path with per-entry fallback on failure). Expected ~10x reduction in
    rs2= duration for players with 3-4 disks.

(2) Ghost-session force-claim: absolute 15s cap instead of stale-heartbeat-only
    Fresh field logs showed the exact scenario Phase 10 left unsolved:
        06:26:43  RESTORE started for 95d0db86
        06:27:44  RESTORE completed in 60627ms   (full poll timeout)
        06:58:16  RESTORE started for 5d582bbc
        06:59:17  RESTORE completed in 61630ms   (full poll timeout)
    The peer's heartbeat was always fresh (age 2-28s, well under the 60s
    stale threshold), so Phase 11's 'only force-claim if stale' gate never
    fired — the loop ran the full 120 attempts. Meanwhile [perf-logout]
    proves real saves commit in < 1s, so a peer that hasn't flushed after
    15s is a ghost session (player disconnected uncleanly, flag stuck at
    online=1). Waiting another 45s for a save that isn't coming is pure
    UX cost.

    Fix: after join_peer_alive_max_wait_seconds (default raised from 5 to
    15), force-claim unconditionally. Safe because:
      - 15s is 15x the max observed save time — real saves are always
        committed to DB by then.
      - Phase 2's last_server guard already blocks any late write from
        the ghost session (the guard logs [GUARD] on the peer's side).
      - Phase 10 duplication scenario (force-claim before peer's async
        save commits) can no longer happen with this safer threshold.

    Peer-truly-stale short-circuit (heartbeat > 60s old) still triggers
    instantly via the isPeerServerStale() check at the top of the loop —
    only the 'peer alive but player ghost' path changed semantics.
---
 .../fubuki/playersync/config/JdbcConfig.java  | 13 ++---
 .../fubuki/playersync/sync/VanillaSync.java   | 55 ++++++++++---------
 .../playersync/sync/addons/ModsSupport.java   | 12 +++-
 3 files changed, 46 insertions(+), 34 deletions(-)

diff --git a/src/main/java/vip/fubuki/playersync/config/JdbcConfig.java b/src/main/java/vip/fubuki/playersync/config/JdbcConfig.java
index f06e0fa..76c0564 100644
--- a/src/main/java/vip/fubuki/playersync/config/JdbcConfig.java
+++ b/src/main/java/vip/fubuki/playersync/config/JdbcConfig.java
@@ -178,13 +178,12 @@ public class JdbcConfig {
         JOIN_PEER_ALIVE_MAX_WAIT_SECONDS = B.comment(
                 "When the previous server is ALIVE (heartbeat fresh) but the player row still",
                 "shows online=1 on it, how long to wait before force-claiming ownership on this",
-                "server. Prevents the 30-60s 'empty inventory' window when a player active on",
-                "peer A connects to peer B without cleanly logging out (proxy, network drop,",
-                "dup session). After this timeout, peer A will simply fail to save this player",
-                "(blocked by last_server guard) and their next disconnect won't overwrite B's",
-                "data. Default 5s. Set to 0 to force-claim immediately; set high to restore the",
-                "legacy behavior of waiting for the peer to flush.")
-                .defineInRange("join_peer_alive_max_wait_seconds", 5, 0, 600);
+                "server. Ghost sessions (network drop, proxy bypass, stuck flag) otherwise hold",
+                "the join hostage up to 60s. Real logout saves consistently complete in <1s in",
+                "production, so any wait > ~15s means the peer isn't going to flush — force-",
+                "claim is safe because peer's future saves get blocked by the last_server guard.",
+                "Default 15s. Set to 0 to force-claim immediately; set high to be more patient.")
+                .defineInRange("join_peer_alive_max_wait_seconds", 15, 0, 600);
         POOL_STATS_INTERVAL_MINUTES = B.comment(
                 "How often PoolStatsReporter logs executor + Hikari stats. 0 to disable.")
                 .defineInRange("pool_stats_interval_minutes", 5, 0, 1440);
diff --git a/src/main/java/vip/fubuki/playersync/sync/VanillaSync.java b/src/main/java/vip/fubuki/playersync/sync/VanillaSync.java
index 194b938..3b4353d 100644
--- a/src/main/java/vip/fubuki/playersync/sync/VanillaSync.java
+++ b/src/main/java/vip/fubuki/playersync/sync/VanillaSync.java
@@ -457,32 +457,37 @@ public class VanillaSync {
                             // atomic), just maybe slowly on a heavy-load server.
                             long waitedMs = System.currentTimeMillis() - pollStartTime;
                             if (PEER_ALIVE_MAX_WAIT_MS > 0 && waitedMs >= PEER_ALIVE_MAX_WAIT_MS) {
-                                // PHASE 11 FIX: compare peer heartbeat age against the stale
-                                // threshold (default 60s), NOT the 5s wait cap. A normal peer
-                                // with a 30s heartbeat interval naturally has an age of 0-30s
-                                // between ticks — using 5s as the "frozen" threshold caused
-                                // EVERY cross-server join to misclassify a healthy peer as
-                                // frozen and force-claim unnecessarily (observed in prod logs:
-                                // 'heartbeat frozen 5380ms').
+                                // PHASE 13: absolute wait cap as force-claim trigger.
+                                // Real logout saves complete in <1s in production (measured via
+                                // Phase 10 [perf-logout] breakdown: core+bp+ss+rs2 always 400-
+                                // 600ms). A wait of 15s+ with player still online=1 means the
+                                // peer is NOT going to flush — it's a ghost session (network
+                                // drop, proxy bypass, stuck flag). Keeping users hostage for the
+                                // full 60s poll was the reported 20s-60s join latency.
+                                //
+                                // Force-claim is safe here because:
+                                //   1. Phase 10 duplication risk was 'force-claim BEFORE peer's
+                                //      async save commits' — but 15s is 15x the max observed
+                                //      save time, so the peer has either committed or never
+                                //      will.
+                                //   2. writeSnapshotToDB's last_server guard blocks any future
+                                //      write from the peer for this player — their ghost
+                                //      session's final save (if it ever comes) silently fails
+                                //      and logs [GUARD] instead of overwriting our data.
+                                //
+                                // A separately-stale heartbeat still short-circuits immediately
+                                // (handled above by isPeerServerStale at the start of the loop).
                                 long peerAgeMs = peerHeartbeatAgeMs(otherServer);
-                                if (peerAgeMs > STALE_HEARTBEAT_MS) {
-                                    SyncLogger.raceCondition(player_uuid,
-                                            "Peer " + otherServer + " heartbeat stale " + peerAgeMs
-                                                    + "ms > " + STALE_HEARTBEAT_MS + "ms, waited " + waitedMs + "ms — force-claiming");
-                                    PlayerSync.LOGGER.warn(
-                                            "Player {} waited {}ms for peer {} whose heartbeat is {}ms old (threshold {}ms) — force-claiming",
-                                            player_uuid, waitedMs, otherServer, peerAgeMs, STALE_HEARTBEAT_MS);
-                                    JDBCsetUp.executePreparedUpdate(
-                                            "UPDATE " + Tables.playerData() + " SET online=0 WHERE uuid=? AND last_server=?",
-                                            player_uuid, otherServer);
-                                    break;
-                                }
-                                // Peer is actively heartbeating but slow to flush — keep waiting.
-                                // Warn sparingly to avoid log flood.
-                                if (attempt % 20 == 0) {
-                                    SyncLogger.warnPlayer(player_uuid,
-                                            "peer " + otherServer + " healthy but slow to flush (waited=" + waitedMs + "ms, hb_age=" + peerAgeMs + "ms)");
-                                }
+                                SyncLogger.raceCondition(player_uuid,
+                                        "Peer " + otherServer + " ghost session suspected — waited "
+                                                + waitedMs + "ms (hb_age=" + peerAgeMs + "ms), force-claiming");
+                                PlayerSync.LOGGER.warn(
+                                        "Player {} force-claiming from peer {} after {}ms wait (hb_age={}ms) — ghost session (peer won't flush)",
+                                        player_uuid, otherServer, waitedMs, peerAgeMs);
+                                JDBCsetUp.executePreparedUpdate(
+                                        "UPDATE " + Tables.playerData() + " SET online=0 WHERE uuid=? AND last_server=?",
+                                        player_uuid, otherServer);
+                                break;
                             }
                             // PHASE 11: log RACE only every 10 attempts instead of every tick.
                             // Previous behavior produced up to 120 lines per cross-server join,
diff --git a/src/main/java/vip/fubuki/playersync/sync/addons/ModsSupport.java b/src/main/java/vip/fubuki/playersync/sync/addons/ModsSupport.java
index ed31f2f..0bceae2 100644
--- a/src/main/java/vip/fubuki/playersync/sync/addons/ModsSupport.java
+++ b/src/main/java/vip/fubuki/playersync/sync/addons/ModsSupport.java
@@ -1156,13 +1156,21 @@ public class ModsSupport {
 
             net.minecraft.nbt.CompoundTag fullNbt = sd.save(new net.minecraft.nbt.CompoundTag(), registryAccess);
 
+            // PHASE 13 PERF: collect all disk NBTs into a single Map and delegate to the
+            // batched writer. Previous behavior made N sequential REPLACE INTO calls
+            // (observed as rs2=500ms+ in [perf-logout] breakdowns). One batched transaction
+            // now dominates by ~10× for players with multiple disks.
+            Map<UUID, CompoundTag> toSave = new HashMap<>();
             for (UUID uuid : diskUuids) {
                 net.minecraft.nbt.CompoundTag entryNbt = findRS2EntryInNbt(fullNbt, uuid.toString());
                 if (entryNbt != null && !entryNbt.isEmpty()) {
-                    saveStorageContents(uuid, entryNbt);
-                    PlayerSync.LOGGER.info("Saved RS2 disk data for UUID {} (async save)", uuid);
+                    toSave.put(uuid, entryNbt);
                 }
             }
+            if (!toSave.isEmpty()) {
+                saveBackpackSnapshots(toSave); // shared batched writer (backpack_data table)
+                PlayerSync.LOGGER.info("Saved {} RS2 disk(s) in one batch", toSave.size());
+            }
         } catch (Exception e) {
             PlayerSync.LOGGER.error("Error saving RS2 disks by level", e);
         }