Phase 3: anti-loss infrastructure (shutdown hook + heartbeat + crash recovery)

Adds three utilities to harden PlayerSync against ungraceful server exits: CrashRecovery.java - installShutdownHook: registers a non-daemon JVM shutdown hook that calls VanillaSync.emergencyFlushAll() synchronously when the process is killed (SIGTERM, kill, OOM, host reboot). Covers the case where the normal ServerStoppingEvent path never runs. - clearOrphanedOnlineFlags: on startup, clears any online=1 player_data rows pointing to this server_id (left by a previous crash). Reports the count via SyncLogger so admins can see recovery activity. - reportZombiePeers: logs peer server_ids whose heartbeat is missing or stale (>60s), exposing the root of doPlayerJoin poll timeouts. HeartbeatService.java - Single-thread daemon scheduler pinging server_info.last_update every 10s. - Lets peer servers distinguish live from dead via isPeerServerStale(). - Stopped explicitly in VanillaSync.onServerShutdown before pool close. VanillaSync.emergencyFlushAll() - Synchronous best-effort flush for every online player. No executor, no locks — the server is dying, we just want data on disk. Writes player_data, backpacks, SS, RS2 directly; logs SAVE/SKIPPED/FAILED per player via SyncLogger so post-mortem analysis is possible. PlayerSync.onServerStarting wires the four new calls after table init. Fixes the production issue where players remained online=1 forever after kill -9 and the 30s poll timeouts waiting for zombie server_ids.
2026-04-22 05:44:19 +02:00 · 2026-04-22 05:44:19 +02:00 · 746cb56275
commit 746cb56275
parent c84f920d11
4 changed files with 262 additions and 0 deletions
--- a/src/main/java/vip/fubuki/playersync/PlayerSync.java
+++ b/src/main/java/vip/fubuki/playersync/PlayerSync.java
@ -207,6 +207,18 @@ public class PlayerSync {
        } catch (Exception e) {
            LOGGER.error("An exception occurred while trying change wrong player-status\n" + e.getMessage());
        }
        // Phase 3: anti-loss infrastructure.
        //   1. Clear orphaned online=1 flags from previous unclean shutdown.
        //   2. Report zombie peer servers so admins see them in logs.
        //   3. Install JVM shutdown hook — covers kill -9 / OOM / host reboot.
        //   4. Start periodic heartbeat so peers can detect us as alive.
        vip.fubuki.playersync.util.CrashRecovery.clearOrphanedOnlineFlags();
        vip.fubuki.playersync.util.CrashRecovery.reportZombiePeers(60_000L);
        vip.fubuki.playersync.util.CrashRecovery.installShutdownHook(() ->
                vip.fubuki.playersync.sync.VanillaSync.emergencyFlushAll());
        vip.fubuki.playersync.util.HeartbeatService.start();
        LOGGER.info("PlayerSync is ready!");
    }
--- a/src/main/java/vip/fubuki/playersync/sync/VanillaSync.java
+++ b/src/main/java/vip/fubuki/playersync/sync/VanillaSync.java
@ -1067,6 +1067,9 @@ public class VanillaSync {
        }
        JDBCsetUp.executePreparedUpdate("UPDATE " + Tables.serverInfo() + " SET enable=0 WHERE id=?", JdbcConfig.SERVER_ID.get());
        // Phase 3: stop heartbeat before pool shutdown so its tick doesn't race with pool close.
        vip.fubuki.playersync.util.HeartbeatService.stop();
        // Shut down the background executor — no new tasks after this point
        executorService.shutdown();
        try {
@ -1087,6 +1090,57 @@ public class VanillaSync {
        vip.fubuki.playersync.util.SyncLogger.shutdown();
    }
    /**
     * Phase 3 emergency flush invoked from the JVM shutdown hook (kill -9, OOM, host
     * reboot) when {@code onServerShutdown} never ran. Runs on the JVM shutdown thread,
     * synchronously, WITHOUT the executor (which may be already draining or dead).
     *
     * <p>Best-effort: snapshots and writes every still-online player using direct
     * DB calls. No lock acquisition — the server is dying, we just want data on disk.
     * If the DB pool is already closed, we log and exit gracefully.
     */
    public static void emergencyFlushAll() {
        try {
            MinecraftServer server = ServerLifecycleHooks.getCurrentServer();
            if (server == null) {
                PlayerSync.LOGGER.warn("[emergency-flush] no server instance — nothing to flush");
                return;
            }
            int flushed = 0;
            for (ServerPlayer player : server.getPlayerList().getPlayers()) {
                String puuid = player.getUUID().toString();
                if (!player.getTags().contains("player_synced") || player.isDeadOrDying()) continue;
                try {
                    final PlayerDataSnapshot snapshot = snapshotPlayerData(player);
                    final Map<UUID, CompoundTag> backpackSnapshots = ModsSupport.snapshotBackpackData(player);
                    final Map<UUID, CompoundTag> ssSnapshots = ModsSupport.snapshotSSData(ModsSupport.collectSSUuids(player));
                    // Direct synchronous write (no executor, no lock).
                    boolean persisted = writeSnapshotToDB(snapshot, true);
                    if (persisted) {
                        ModsSupport.saveBackpackSnapshots(backpackSnapshots);
                        ModsSupport.saveSSSnapshots(ssSnapshots);
                        if (ModList.get().isLoaded("refinedstorage")) {
                            List<UUID> rs2 = ModsSupport.collectRS2DiskUuids(player);
                            if (!rs2.isEmpty()) {
                                ModsSupport.saveRS2DisksByLevel(rs2, player.serverLevel(), server.registryAccess());
                            }
                        }
                        SyncLogger.saveCompleted(puuid, "EMERGENCY_FLUSH", 0);
                        flushed++;
                    } else {
                        SyncLogger.saveSkipped(puuid, "EMERGENCY_FLUSH", "core guard blocked");
                    }
                } catch (Throwable t) {
                    PlayerSync.LOGGER.error("[emergency-flush] failed for {}: {}", puuid, t.getMessage());
                    SyncLogger.saveFailed(puuid, "EMERGENCY_FLUSH", t.getMessage());
                }
            }
            PlayerSync.LOGGER.warn("[emergency-flush] flushed {} players via shutdown hook", flushed);
        } catch (Throwable t) {
            PlayerSync.LOGGER.error("[emergency-flush] top-level failure", t);
        }
    }
    /**
     * FIX: Logout saves are now FULLY NON-BLOCKING on the main thread.
     *
--- a/src/main/java/vip/fubuki/playersync/util/CrashRecovery.java
+++ b/src/main/java/vip/fubuki/playersync/util/CrashRecovery.java
@ -0,0 +1,131 @@
 package vip.fubuki.playersync.util;
 import vip.fubuki.playersync.PlayerSync;
 import vip.fubuki.playersync.config.JdbcConfig;
 import java.sql.ResultSet;
 import java.util.concurrent.atomic.AtomicBoolean;
 /**
 * Crash-recovery + shutdown-hook helper.
 *
 * <p>Installs a JVM shutdown hook that flushes pending saves and writes a
 * graceful-shutdown marker into {@code server_info}. On next startup, scans
 * {@code player_data} for rows stuck at {@code online=1} on this server and
 * clears them — covers {@code kill -9} / OOM / JVM abort scenarios where the
 * normal ServerStoppingEvent path never ran.
 *
 * <p>Companion of {@link HeartbeatService} which keeps {@code server_info.last_update}
 * fresh so peer servers can detect this one as alive.
 *
 * @author vyrriox
 */
 public final class CrashRecovery {
    private CrashRecovery() {}
    private static final AtomicBoolean HOOK_INSTALLED = new AtomicBoolean(false);
    private static volatile Runnable flushCallback;
    /**
     * Registers a JVM shutdown hook. Called once from PlayerSync.onServerStarting
     * AFTER the DB pool is up. The {@code flushTask} is invoked on JVM shutdown —
     * use it to snapshot all still-online players synchronously (no async executor,
     * the pool may already be draining).
     */
    public static void installShutdownHook(Runnable flushTask) {
        if (!HOOK_INSTALLED.compareAndSet(false, true)) return;
        flushCallback = flushTask;
        Thread hook = new Thread(() -> {
            try {
                PlayerSync.LOGGER.warn("[crash-recovery] JVM shutdown hook fired — flushing pending saves");
                SyncLogger.playerEvent("SYSTEM", "JVM_SHUTDOWN_HOOK", "Flushing pending saves before JVM exit");
                if (flushCallback != null) {
                    try {
                        flushCallback.run();
                    } catch (Throwable t) {
                        PlayerSync.LOGGER.error("[crash-recovery] flush callback threw", t);
                    }
                }
                // Mark this server as gracefully stopped so peers know it's dead.
                try {
                    JDBCsetUp.executePreparedUpdate(
                            "UPDATE " + Tables.serverInfo() + " SET enable=0, last_update=? WHERE id=?",
                            System.currentTimeMillis(), JdbcConfig.SERVER_ID.get());
                } catch (Exception e) {
                    PlayerSync.LOGGER.warn("[crash-recovery] could not mark server stopped: {}", e.getMessage());
                }
            } catch (Throwable t) {
                // NEVER let the hook throw — it would block JVM exit.
                PlayerSync.LOGGER.error("[crash-recovery] hook failed", t);
            }
        }, "PlayerSync-shutdown-hook");
        hook.setDaemon(false); // MUST be non-daemon: daemon threads are killed on exit
        Runtime.getRuntime().addShutdownHook(hook);
        PlayerSync.LOGGER.info("[crash-recovery] JVM shutdown hook installed");
    }
    /**
     * Scans {@code player_data} for orphaned online=1 rows on this server and
     * clears them. Called from PlayerSync.onServerStarting AFTER the tables are
     * created. This is the recovery path for players who were online when the
     * server was killed ungracefully (kill -9, OOM, host reboot).
     */
    public static void clearOrphanedOnlineFlags() {
        int serverId = JdbcConfig.SERVER_ID.get();
        try {
            // Count first so we know what we're about to clear.
            int count = 0;
            try (JDBCsetUp.QueryResult qr = JDBCsetUp.executePreparedQuery(
                    "SELECT COUNT(*) AS c FROM " + Tables.playerData() + " WHERE last_server=? AND online=1",
                    serverId)) {
                ResultSet rs = qr.resultSet();
                if (rs.next()) count = rs.getInt("c");
            }
            JDBCsetUp.executePreparedUpdate(
                    "UPDATE " + Tables.playerData() + " SET online=0 WHERE last_server=? AND online=1",
                    serverId);
            if (count > 0) {
                PlayerSync.LOGGER.warn("[crash-recovery] cleared {} orphan online=1 rows from previous session (server_id={})",
                        count, serverId);
                SyncLogger.playerEvent("SYSTEM", "ORPHAN_CLEAR",
                        "Cleared " + count + " online=1 rows left by previous session crash");
            } else {
                PlayerSync.LOGGER.info("[crash-recovery] no orphan online=1 rows found — previous shutdown was clean");
            }
        } catch (Exception e) {
            PlayerSync.LOGGER.error("[crash-recovery] failed to scan for orphans", e);
        }
    }
    /**
     * Reports peer servers whose heartbeat is stale. Informational — useful to
     * surface zombie server_ids that could trip doPlayerJoin's poll. Called once
     * on startup.
     */
    public static void reportZombiePeers(long staleAfterMs) {
        try (JDBCsetUp.QueryResult qr = JDBCsetUp.executePreparedQuery(
                "SELECT id, last_update FROM " + Tables.serverInfo() + " WHERE enable=1 AND id<>?",
                JdbcConfig.SERVER_ID.get())) {
            ResultSet rs = qr.resultSet();
            long now = System.currentTimeMillis();
            int zombies = 0;
            while (rs.next()) {
                int id = rs.getInt("id");
                long last = rs.getLong("last_update");
                long age = now - last;
                if (id == 0 || age > staleAfterMs) {
                    zombies++;
                    PlayerSync.LOGGER.warn("[crash-recovery] peer server_id={} is zombie (last_update age={}ms, enabled=true)",
                            id, age);
                }
            }
            if (zombies > 0) {
                SyncLogger.playerEvent("SYSTEM", "ZOMBIE_PEERS", zombies + " peer server(s) appear stale");
            }
        } catch (Exception e) {
            PlayerSync.LOGGER.warn("[crash-recovery] zombie peer scan failed: {}", e.getMessage());
        }
    }
 }
--- a/src/main/java/vip/fubuki/playersync/util/HeartbeatService.java
+++ b/src/main/java/vip/fubuki/playersync/util/HeartbeatService.java
@ -0,0 +1,65 @@
 package vip.fubuki.playersync.util;
 import vip.fubuki.playersync.PlayerSync;
 import vip.fubuki.playersync.config.JdbcConfig;
 import java.util.concurrent.Executors;
 import java.util.concurrent.ScheduledExecutorService;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
 /**
 * Periodic {@code server_info.last_update} heartbeat.
 *
 * <p>Runs on a dedicated single-threaded scheduler at a fixed interval so peer
 * servers can detect this server as alive via {@code isPeerServerStale()} in
 * {@code VanillaSync.doPlayerJoin}. Without this, a server that stops issuing
 * updates (e.g. hung main thread) would be treated as alive indefinitely by
 * rejoining players on other servers, causing the 30s poll timeouts seen in
 * production logs.
 *
 * @author vyrriox
 */
 public final class HeartbeatService {
    private HeartbeatService() {}
    /** Heartbeat period: 10s. Short enough that a 60s staleness threshold catches real outages. */
    private static final long PERIOD_MS = 10_000L;
    private static final AtomicBoolean RUNNING = new AtomicBoolean(false);
    private static ScheduledExecutorService scheduler;
    public static void start() {
        if (!RUNNING.compareAndSet(false, true)) return;
        scheduler = Executors.newSingleThreadScheduledExecutor(r -> {
            Thread t = new Thread(r, "PlayerSync-heartbeat");
            t.setDaemon(true);
            t.setPriority(Thread.MIN_PRIORITY);
            return t;
        });
        scheduler.scheduleAtFixedRate(HeartbeatService::tick, PERIOD_MS, PERIOD_MS, TimeUnit.MILLISECONDS);
        PlayerSync.LOGGER.info("[heartbeat] started (period={}ms, server_id={})", PERIOD_MS, JdbcConfig.SERVER_ID.get());
    }
    public static void stop() {
        if (!RUNNING.compareAndSet(true, false)) return;
        if (scheduler != null) {
            scheduler.shutdownNow();
            scheduler = null;
        }
        PlayerSync.LOGGER.info("[heartbeat] stopped");
    }
    private static void tick() {
        try {
            int serverId = JdbcConfig.SERVER_ID.get();
            JDBCsetUp.executePreparedUpdate(
                    "UPDATE " + Tables.serverInfo() + " SET last_update=?, enable=1 WHERE id=?",
                    System.currentTimeMillis(), serverId);
        } catch (Throwable t) {
            // Do not kill the scheduler on a transient DB error — log and retry next tick.
            PlayerSync.LOGGER.warn("[heartbeat] tick failed: {}", t.getMessage());
        }
    }
 }