From 746cb5627557675f015a3d87a1df7eafbcba854f Mon Sep 17 00:00:00 2001 From: laforetbrut Date: Wed, 22 Apr 2026 05:44:19 +0200 Subject: [PATCH] Phase 3: anti-loss infrastructure (shutdown hook + heartbeat + crash recovery) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds three utilities to harden PlayerSync against ungraceful server exits: CrashRecovery.java - installShutdownHook: registers a non-daemon JVM shutdown hook that calls VanillaSync.emergencyFlushAll() synchronously when the process is killed (SIGTERM, kill, OOM, host reboot). Covers the case where the normal ServerStoppingEvent path never runs. - clearOrphanedOnlineFlags: on startup, clears any online=1 player_data rows pointing to this server_id (left by a previous crash). Reports the count via SyncLogger so admins can see recovery activity. - reportZombiePeers: logs peer server_ids whose heartbeat is missing or stale (>60s), exposing the root of doPlayerJoin poll timeouts. HeartbeatService.java - Single-thread daemon scheduler pinging server_info.last_update every 10s. - Lets peer servers distinguish live from dead via isPeerServerStale(). - Stopped explicitly in VanillaSync.onServerShutdown before pool close. VanillaSync.emergencyFlushAll() - Synchronous best-effort flush for every online player. No executor, no locks — the server is dying, we just want data on disk. Writes player_data, backpacks, SS, RS2 directly; logs SAVE/SKIPPED/FAILED per player via SyncLogger so post-mortem analysis is possible. PlayerSync.onServerStarting wires the four new calls after table init. Fixes the production issue where players remained online=1 forever after kill -9 and the 30s poll timeouts waiting for zombie server_ids. --- .../vip/fubuki/playersync/PlayerSync.java | 12 ++ .../fubuki/playersync/sync/VanillaSync.java | 54 ++++++++ .../fubuki/playersync/util/CrashRecovery.java | 131 ++++++++++++++++++ .../playersync/util/HeartbeatService.java | 65 +++++++++ 4 files changed, 262 insertions(+) create mode 100644 src/main/java/vip/fubuki/playersync/util/CrashRecovery.java create mode 100644 src/main/java/vip/fubuki/playersync/util/HeartbeatService.java diff --git a/src/main/java/vip/fubuki/playersync/PlayerSync.java b/src/main/java/vip/fubuki/playersync/PlayerSync.java index b2a2208..9479ad0 100644 --- a/src/main/java/vip/fubuki/playersync/PlayerSync.java +++ b/src/main/java/vip/fubuki/playersync/PlayerSync.java @@ -207,6 +207,18 @@ public class PlayerSync { } catch (Exception e) { LOGGER.error("An exception occurred while trying change wrong player-status\n" + e.getMessage()); } + + // Phase 3: anti-loss infrastructure. + // 1. Clear orphaned online=1 flags from previous unclean shutdown. + // 2. Report zombie peer servers so admins see them in logs. + // 3. Install JVM shutdown hook — covers kill -9 / OOM / host reboot. + // 4. Start periodic heartbeat so peers can detect us as alive. + vip.fubuki.playersync.util.CrashRecovery.clearOrphanedOnlineFlags(); + vip.fubuki.playersync.util.CrashRecovery.reportZombiePeers(60_000L); + vip.fubuki.playersync.util.CrashRecovery.installShutdownHook(() -> + vip.fubuki.playersync.sync.VanillaSync.emergencyFlushAll()); + vip.fubuki.playersync.util.HeartbeatService.start(); + LOGGER.info("PlayerSync is ready!"); } diff --git a/src/main/java/vip/fubuki/playersync/sync/VanillaSync.java b/src/main/java/vip/fubuki/playersync/sync/VanillaSync.java index 0ab6d17..9bad5ac 100644 --- a/src/main/java/vip/fubuki/playersync/sync/VanillaSync.java +++ b/src/main/java/vip/fubuki/playersync/sync/VanillaSync.java @@ -1067,6 +1067,9 @@ public class VanillaSync { } JDBCsetUp.executePreparedUpdate("UPDATE " + Tables.serverInfo() + " SET enable=0 WHERE id=?", JdbcConfig.SERVER_ID.get()); + // Phase 3: stop heartbeat before pool shutdown so its tick doesn't race with pool close. + vip.fubuki.playersync.util.HeartbeatService.stop(); + // Shut down the background executor — no new tasks after this point executorService.shutdown(); try { @@ -1087,6 +1090,57 @@ public class VanillaSync { vip.fubuki.playersync.util.SyncLogger.shutdown(); } + /** + * Phase 3 emergency flush invoked from the JVM shutdown hook (kill -9, OOM, host + * reboot) when {@code onServerShutdown} never ran. Runs on the JVM shutdown thread, + * synchronously, WITHOUT the executor (which may be already draining or dead). + * + *

Best-effort: snapshots and writes every still-online player using direct + * DB calls. No lock acquisition — the server is dying, we just want data on disk. + * If the DB pool is already closed, we log and exit gracefully. + */ + public static void emergencyFlushAll() { + try { + MinecraftServer server = ServerLifecycleHooks.getCurrentServer(); + if (server == null) { + PlayerSync.LOGGER.warn("[emergency-flush] no server instance — nothing to flush"); + return; + } + int flushed = 0; + for (ServerPlayer player : server.getPlayerList().getPlayers()) { + String puuid = player.getUUID().toString(); + if (!player.getTags().contains("player_synced") || player.isDeadOrDying()) continue; + try { + final PlayerDataSnapshot snapshot = snapshotPlayerData(player); + final Map backpackSnapshots = ModsSupport.snapshotBackpackData(player); + final Map ssSnapshots = ModsSupport.snapshotSSData(ModsSupport.collectSSUuids(player)); + // Direct synchronous write (no executor, no lock). + boolean persisted = writeSnapshotToDB(snapshot, true); + if (persisted) { + ModsSupport.saveBackpackSnapshots(backpackSnapshots); + ModsSupport.saveSSSnapshots(ssSnapshots); + if (ModList.get().isLoaded("refinedstorage")) { + List rs2 = ModsSupport.collectRS2DiskUuids(player); + if (!rs2.isEmpty()) { + ModsSupport.saveRS2DisksByLevel(rs2, player.serverLevel(), server.registryAccess()); + } + } + SyncLogger.saveCompleted(puuid, "EMERGENCY_FLUSH", 0); + flushed++; + } else { + SyncLogger.saveSkipped(puuid, "EMERGENCY_FLUSH", "core guard blocked"); + } + } catch (Throwable t) { + PlayerSync.LOGGER.error("[emergency-flush] failed for {}: {}", puuid, t.getMessage()); + SyncLogger.saveFailed(puuid, "EMERGENCY_FLUSH", t.getMessage()); + } + } + PlayerSync.LOGGER.warn("[emergency-flush] flushed {} players via shutdown hook", flushed); + } catch (Throwable t) { + PlayerSync.LOGGER.error("[emergency-flush] top-level failure", t); + } + } + /** * FIX: Logout saves are now FULLY NON-BLOCKING on the main thread. * diff --git a/src/main/java/vip/fubuki/playersync/util/CrashRecovery.java b/src/main/java/vip/fubuki/playersync/util/CrashRecovery.java new file mode 100644 index 0000000..d4f8028 --- /dev/null +++ b/src/main/java/vip/fubuki/playersync/util/CrashRecovery.java @@ -0,0 +1,131 @@ +package vip.fubuki.playersync.util; + +import vip.fubuki.playersync.PlayerSync; +import vip.fubuki.playersync.config.JdbcConfig; + +import java.sql.ResultSet; +import java.util.concurrent.atomic.AtomicBoolean; + +/** + * Crash-recovery + shutdown-hook helper. + * + *

Installs a JVM shutdown hook that flushes pending saves and writes a + * graceful-shutdown marker into {@code server_info}. On next startup, scans + * {@code player_data} for rows stuck at {@code online=1} on this server and + * clears them — covers {@code kill -9} / OOM / JVM abort scenarios where the + * normal ServerStoppingEvent path never ran. + * + *

Companion of {@link HeartbeatService} which keeps {@code server_info.last_update} + * fresh so peer servers can detect this one as alive. + * + * @author vyrriox + */ +public final class CrashRecovery { + + private CrashRecovery() {} + + private static final AtomicBoolean HOOK_INSTALLED = new AtomicBoolean(false); + private static volatile Runnable flushCallback; + + /** + * Registers a JVM shutdown hook. Called once from PlayerSync.onServerStarting + * AFTER the DB pool is up. The {@code flushTask} is invoked on JVM shutdown — + * use it to snapshot all still-online players synchronously (no async executor, + * the pool may already be draining). + */ + public static void installShutdownHook(Runnable flushTask) { + if (!HOOK_INSTALLED.compareAndSet(false, true)) return; + flushCallback = flushTask; + + Thread hook = new Thread(() -> { + try { + PlayerSync.LOGGER.warn("[crash-recovery] JVM shutdown hook fired — flushing pending saves"); + SyncLogger.playerEvent("SYSTEM", "JVM_SHUTDOWN_HOOK", "Flushing pending saves before JVM exit"); + if (flushCallback != null) { + try { + flushCallback.run(); + } catch (Throwable t) { + PlayerSync.LOGGER.error("[crash-recovery] flush callback threw", t); + } + } + // Mark this server as gracefully stopped so peers know it's dead. + try { + JDBCsetUp.executePreparedUpdate( + "UPDATE " + Tables.serverInfo() + " SET enable=0, last_update=? WHERE id=?", + System.currentTimeMillis(), JdbcConfig.SERVER_ID.get()); + } catch (Exception e) { + PlayerSync.LOGGER.warn("[crash-recovery] could not mark server stopped: {}", e.getMessage()); + } + } catch (Throwable t) { + // NEVER let the hook throw — it would block JVM exit. + PlayerSync.LOGGER.error("[crash-recovery] hook failed", t); + } + }, "PlayerSync-shutdown-hook"); + hook.setDaemon(false); // MUST be non-daemon: daemon threads are killed on exit + Runtime.getRuntime().addShutdownHook(hook); + PlayerSync.LOGGER.info("[crash-recovery] JVM shutdown hook installed"); + } + + /** + * Scans {@code player_data} for orphaned online=1 rows on this server and + * clears them. Called from PlayerSync.onServerStarting AFTER the tables are + * created. This is the recovery path for players who were online when the + * server was killed ungracefully (kill -9, OOM, host reboot). + */ + public static void clearOrphanedOnlineFlags() { + int serverId = JdbcConfig.SERVER_ID.get(); + try { + // Count first so we know what we're about to clear. + int count = 0; + try (JDBCsetUp.QueryResult qr = JDBCsetUp.executePreparedQuery( + "SELECT COUNT(*) AS c FROM " + Tables.playerData() + " WHERE last_server=? AND online=1", + serverId)) { + ResultSet rs = qr.resultSet(); + if (rs.next()) count = rs.getInt("c"); + } + JDBCsetUp.executePreparedUpdate( + "UPDATE " + Tables.playerData() + " SET online=0 WHERE last_server=? AND online=1", + serverId); + if (count > 0) { + PlayerSync.LOGGER.warn("[crash-recovery] cleared {} orphan online=1 rows from previous session (server_id={})", + count, serverId); + SyncLogger.playerEvent("SYSTEM", "ORPHAN_CLEAR", + "Cleared " + count + " online=1 rows left by previous session crash"); + } else { + PlayerSync.LOGGER.info("[crash-recovery] no orphan online=1 rows found — previous shutdown was clean"); + } + } catch (Exception e) { + PlayerSync.LOGGER.error("[crash-recovery] failed to scan for orphans", e); + } + } + + /** + * Reports peer servers whose heartbeat is stale. Informational — useful to + * surface zombie server_ids that could trip doPlayerJoin's poll. Called once + * on startup. + */ + public static void reportZombiePeers(long staleAfterMs) { + try (JDBCsetUp.QueryResult qr = JDBCsetUp.executePreparedQuery( + "SELECT id, last_update FROM " + Tables.serverInfo() + " WHERE enable=1 AND id<>?", + JdbcConfig.SERVER_ID.get())) { + ResultSet rs = qr.resultSet(); + long now = System.currentTimeMillis(); + int zombies = 0; + while (rs.next()) { + int id = rs.getInt("id"); + long last = rs.getLong("last_update"); + long age = now - last; + if (id == 0 || age > staleAfterMs) { + zombies++; + PlayerSync.LOGGER.warn("[crash-recovery] peer server_id={} is zombie (last_update age={}ms, enabled=true)", + id, age); + } + } + if (zombies > 0) { + SyncLogger.playerEvent("SYSTEM", "ZOMBIE_PEERS", zombies + " peer server(s) appear stale"); + } + } catch (Exception e) { + PlayerSync.LOGGER.warn("[crash-recovery] zombie peer scan failed: {}", e.getMessage()); + } + } +} diff --git a/src/main/java/vip/fubuki/playersync/util/HeartbeatService.java b/src/main/java/vip/fubuki/playersync/util/HeartbeatService.java new file mode 100644 index 0000000..d23caa6 --- /dev/null +++ b/src/main/java/vip/fubuki/playersync/util/HeartbeatService.java @@ -0,0 +1,65 @@ +package vip.fubuki.playersync.util; + +import vip.fubuki.playersync.PlayerSync; +import vip.fubuki.playersync.config.JdbcConfig; + +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; + +/** + * Periodic {@code server_info.last_update} heartbeat. + * + *

Runs on a dedicated single-threaded scheduler at a fixed interval so peer + * servers can detect this server as alive via {@code isPeerServerStale()} in + * {@code VanillaSync.doPlayerJoin}. Without this, a server that stops issuing + * updates (e.g. hung main thread) would be treated as alive indefinitely by + * rejoining players on other servers, causing the 30s poll timeouts seen in + * production logs. + * + * @author vyrriox + */ +public final class HeartbeatService { + + private HeartbeatService() {} + + /** Heartbeat period: 10s. Short enough that a 60s staleness threshold catches real outages. */ + private static final long PERIOD_MS = 10_000L; + + private static final AtomicBoolean RUNNING = new AtomicBoolean(false); + private static ScheduledExecutorService scheduler; + + public static void start() { + if (!RUNNING.compareAndSet(false, true)) return; + scheduler = Executors.newSingleThreadScheduledExecutor(r -> { + Thread t = new Thread(r, "PlayerSync-heartbeat"); + t.setDaemon(true); + t.setPriority(Thread.MIN_PRIORITY); + return t; + }); + scheduler.scheduleAtFixedRate(HeartbeatService::tick, PERIOD_MS, PERIOD_MS, TimeUnit.MILLISECONDS); + PlayerSync.LOGGER.info("[heartbeat] started (period={}ms, server_id={})", PERIOD_MS, JdbcConfig.SERVER_ID.get()); + } + + public static void stop() { + if (!RUNNING.compareAndSet(true, false)) return; + if (scheduler != null) { + scheduler.shutdownNow(); + scheduler = null; + } + PlayerSync.LOGGER.info("[heartbeat] stopped"); + } + + private static void tick() { + try { + int serverId = JdbcConfig.SERVER_ID.get(); + JDBCsetUp.executePreparedUpdate( + "UPDATE " + Tables.serverInfo() + " SET last_update=?, enable=1 WHERE id=?", + System.currentTimeMillis(), serverId); + } catch (Throwable t) { + // Do not kill the scheduler on a transient DB error — log and retry next tick. + PlayerSync.LOGGER.warn("[heartbeat] tick failed: {}", t.getMessage()); + } + } +}