Phase 3: anti-loss infrastructure (shutdown hook + heartbeat + crash recovery)

Adds three utilities to harden PlayerSync against ungraceful server exits:

CrashRecovery.java
  - installShutdownHook: registers a non-daemon JVM shutdown hook that calls
    VanillaSync.emergencyFlushAll() synchronously when the process is killed
    (SIGTERM, kill, OOM, host reboot). Covers the case where the normal
    ServerStoppingEvent path never runs.
  - clearOrphanedOnlineFlags: on startup, clears any online=1 player_data
    rows pointing to this server_id (left by a previous crash). Reports the
    count via SyncLogger so admins can see recovery activity.
  - reportZombiePeers: logs peer server_ids whose heartbeat is missing or
    stale (>60s), exposing the root of doPlayerJoin poll timeouts.

HeartbeatService.java
  - Single-thread daemon scheduler pinging server_info.last_update every 10s.
  - Lets peer servers distinguish live from dead via isPeerServerStale().
  - Stopped explicitly in VanillaSync.onServerShutdown before pool close.

VanillaSync.emergencyFlushAll()
  - Synchronous best-effort flush for every online player. No executor, no
    locks — the server is dying, we just want data on disk. Writes player_data,
    backpacks, SS, RS2 directly; logs SAVE/SKIPPED/FAILED per player via
    SyncLogger so post-mortem analysis is possible.

PlayerSync.onServerStarting wires the four new calls after table init.

Fixes the production issue where players remained online=1 forever after
kill -9 and the 30s poll timeouts waiting for zombie server_ids.
This commit is contained in:
laforetbrut 2026-04-22 05:44:19 +02:00
parent c84f920d11
commit 746cb56275
4 changed files with 262 additions and 0 deletions

View File

@ -207,6 +207,18 @@ public class PlayerSync {
} catch (Exception e) { } catch (Exception e) {
LOGGER.error("An exception occurred while trying change wrong player-status\n" + e.getMessage()); LOGGER.error("An exception occurred while trying change wrong player-status\n" + e.getMessage());
} }
// Phase 3: anti-loss infrastructure.
// 1. Clear orphaned online=1 flags from previous unclean shutdown.
// 2. Report zombie peer servers so admins see them in logs.
// 3. Install JVM shutdown hook covers kill -9 / OOM / host reboot.
// 4. Start periodic heartbeat so peers can detect us as alive.
vip.fubuki.playersync.util.CrashRecovery.clearOrphanedOnlineFlags();
vip.fubuki.playersync.util.CrashRecovery.reportZombiePeers(60_000L);
vip.fubuki.playersync.util.CrashRecovery.installShutdownHook(() ->
vip.fubuki.playersync.sync.VanillaSync.emergencyFlushAll());
vip.fubuki.playersync.util.HeartbeatService.start();
LOGGER.info("PlayerSync is ready!"); LOGGER.info("PlayerSync is ready!");
} }

View File

@ -1067,6 +1067,9 @@ public class VanillaSync {
} }
JDBCsetUp.executePreparedUpdate("UPDATE " + Tables.serverInfo() + " SET enable=0 WHERE id=?", JdbcConfig.SERVER_ID.get()); JDBCsetUp.executePreparedUpdate("UPDATE " + Tables.serverInfo() + " SET enable=0 WHERE id=?", JdbcConfig.SERVER_ID.get());
// Phase 3: stop heartbeat before pool shutdown so its tick doesn't race with pool close.
vip.fubuki.playersync.util.HeartbeatService.stop();
// Shut down the background executor no new tasks after this point // Shut down the background executor no new tasks after this point
executorService.shutdown(); executorService.shutdown();
try { try {
@ -1087,6 +1090,57 @@ public class VanillaSync {
vip.fubuki.playersync.util.SyncLogger.shutdown(); vip.fubuki.playersync.util.SyncLogger.shutdown();
} }
/**
* Phase 3 emergency flush invoked from the JVM shutdown hook (kill -9, OOM, host
* reboot) when {@code onServerShutdown} never ran. Runs on the JVM shutdown thread,
* synchronously, WITHOUT the executor (which may be already draining or dead).
*
* <p>Best-effort: snapshots and writes every still-online player using direct
* DB calls. No lock acquisition the server is dying, we just want data on disk.
* If the DB pool is already closed, we log and exit gracefully.
*/
public static void emergencyFlushAll() {
try {
MinecraftServer server = ServerLifecycleHooks.getCurrentServer();
if (server == null) {
PlayerSync.LOGGER.warn("[emergency-flush] no server instance — nothing to flush");
return;
}
int flushed = 0;
for (ServerPlayer player : server.getPlayerList().getPlayers()) {
String puuid = player.getUUID().toString();
if (!player.getTags().contains("player_synced") || player.isDeadOrDying()) continue;
try {
final PlayerDataSnapshot snapshot = snapshotPlayerData(player);
final Map<UUID, CompoundTag> backpackSnapshots = ModsSupport.snapshotBackpackData(player);
final Map<UUID, CompoundTag> ssSnapshots = ModsSupport.snapshotSSData(ModsSupport.collectSSUuids(player));
// Direct synchronous write (no executor, no lock).
boolean persisted = writeSnapshotToDB(snapshot, true);
if (persisted) {
ModsSupport.saveBackpackSnapshots(backpackSnapshots);
ModsSupport.saveSSSnapshots(ssSnapshots);
if (ModList.get().isLoaded("refinedstorage")) {
List<UUID> rs2 = ModsSupport.collectRS2DiskUuids(player);
if (!rs2.isEmpty()) {
ModsSupport.saveRS2DisksByLevel(rs2, player.serverLevel(), server.registryAccess());
}
}
SyncLogger.saveCompleted(puuid, "EMERGENCY_FLUSH", 0);
flushed++;
} else {
SyncLogger.saveSkipped(puuid, "EMERGENCY_FLUSH", "core guard blocked");
}
} catch (Throwable t) {
PlayerSync.LOGGER.error("[emergency-flush] failed for {}: {}", puuid, t.getMessage());
SyncLogger.saveFailed(puuid, "EMERGENCY_FLUSH", t.getMessage());
}
}
PlayerSync.LOGGER.warn("[emergency-flush] flushed {} players via shutdown hook", flushed);
} catch (Throwable t) {
PlayerSync.LOGGER.error("[emergency-flush] top-level failure", t);
}
}
/** /**
* FIX: Logout saves are now FULLY NON-BLOCKING on the main thread. * FIX: Logout saves are now FULLY NON-BLOCKING on the main thread.
* *

View File

@ -0,0 +1,131 @@
package vip.fubuki.playersync.util;
import vip.fubuki.playersync.PlayerSync;
import vip.fubuki.playersync.config.JdbcConfig;
import java.sql.ResultSet;
import java.util.concurrent.atomic.AtomicBoolean;
/**
* Crash-recovery + shutdown-hook helper.
*
* <p>Installs a JVM shutdown hook that flushes pending saves and writes a
* graceful-shutdown marker into {@code server_info}. On next startup, scans
* {@code player_data} for rows stuck at {@code online=1} on this server and
* clears them covers {@code kill -9} / OOM / JVM abort scenarios where the
* normal ServerStoppingEvent path never ran.
*
* <p>Companion of {@link HeartbeatService} which keeps {@code server_info.last_update}
* fresh so peer servers can detect this one as alive.
*
* @author vyrriox
*/
public final class CrashRecovery {
private CrashRecovery() {}
private static final AtomicBoolean HOOK_INSTALLED = new AtomicBoolean(false);
private static volatile Runnable flushCallback;
/**
* Registers a JVM shutdown hook. Called once from PlayerSync.onServerStarting
* AFTER the DB pool is up. The {@code flushTask} is invoked on JVM shutdown
* use it to snapshot all still-online players synchronously (no async executor,
* the pool may already be draining).
*/
public static void installShutdownHook(Runnable flushTask) {
if (!HOOK_INSTALLED.compareAndSet(false, true)) return;
flushCallback = flushTask;
Thread hook = new Thread(() -> {
try {
PlayerSync.LOGGER.warn("[crash-recovery] JVM shutdown hook fired — flushing pending saves");
SyncLogger.playerEvent("SYSTEM", "JVM_SHUTDOWN_HOOK", "Flushing pending saves before JVM exit");
if (flushCallback != null) {
try {
flushCallback.run();
} catch (Throwable t) {
PlayerSync.LOGGER.error("[crash-recovery] flush callback threw", t);
}
}
// Mark this server as gracefully stopped so peers know it's dead.
try {
JDBCsetUp.executePreparedUpdate(
"UPDATE " + Tables.serverInfo() + " SET enable=0, last_update=? WHERE id=?",
System.currentTimeMillis(), JdbcConfig.SERVER_ID.get());
} catch (Exception e) {
PlayerSync.LOGGER.warn("[crash-recovery] could not mark server stopped: {}", e.getMessage());
}
} catch (Throwable t) {
// NEVER let the hook throw it would block JVM exit.
PlayerSync.LOGGER.error("[crash-recovery] hook failed", t);
}
}, "PlayerSync-shutdown-hook");
hook.setDaemon(false); // MUST be non-daemon: daemon threads are killed on exit
Runtime.getRuntime().addShutdownHook(hook);
PlayerSync.LOGGER.info("[crash-recovery] JVM shutdown hook installed");
}
/**
* Scans {@code player_data} for orphaned online=1 rows on this server and
* clears them. Called from PlayerSync.onServerStarting AFTER the tables are
* created. This is the recovery path for players who were online when the
* server was killed ungracefully (kill -9, OOM, host reboot).
*/
public static void clearOrphanedOnlineFlags() {
int serverId = JdbcConfig.SERVER_ID.get();
try {
// Count first so we know what we're about to clear.
int count = 0;
try (JDBCsetUp.QueryResult qr = JDBCsetUp.executePreparedQuery(
"SELECT COUNT(*) AS c FROM " + Tables.playerData() + " WHERE last_server=? AND online=1",
serverId)) {
ResultSet rs = qr.resultSet();
if (rs.next()) count = rs.getInt("c");
}
JDBCsetUp.executePreparedUpdate(
"UPDATE " + Tables.playerData() + " SET online=0 WHERE last_server=? AND online=1",
serverId);
if (count > 0) {
PlayerSync.LOGGER.warn("[crash-recovery] cleared {} orphan online=1 rows from previous session (server_id={})",
count, serverId);
SyncLogger.playerEvent("SYSTEM", "ORPHAN_CLEAR",
"Cleared " + count + " online=1 rows left by previous session crash");
} else {
PlayerSync.LOGGER.info("[crash-recovery] no orphan online=1 rows found — previous shutdown was clean");
}
} catch (Exception e) {
PlayerSync.LOGGER.error("[crash-recovery] failed to scan for orphans", e);
}
}
/**
* Reports peer servers whose heartbeat is stale. Informational useful to
* surface zombie server_ids that could trip doPlayerJoin's poll. Called once
* on startup.
*/
public static void reportZombiePeers(long staleAfterMs) {
try (JDBCsetUp.QueryResult qr = JDBCsetUp.executePreparedQuery(
"SELECT id, last_update FROM " + Tables.serverInfo() + " WHERE enable=1 AND id<>?",
JdbcConfig.SERVER_ID.get())) {
ResultSet rs = qr.resultSet();
long now = System.currentTimeMillis();
int zombies = 0;
while (rs.next()) {
int id = rs.getInt("id");
long last = rs.getLong("last_update");
long age = now - last;
if (id == 0 || age > staleAfterMs) {
zombies++;
PlayerSync.LOGGER.warn("[crash-recovery] peer server_id={} is zombie (last_update age={}ms, enabled=true)",
id, age);
}
}
if (zombies > 0) {
SyncLogger.playerEvent("SYSTEM", "ZOMBIE_PEERS", zombies + " peer server(s) appear stale");
}
} catch (Exception e) {
PlayerSync.LOGGER.warn("[crash-recovery] zombie peer scan failed: {}", e.getMessage());
}
}
}

View File

@ -0,0 +1,65 @@
package vip.fubuki.playersync.util;
import vip.fubuki.playersync.PlayerSync;
import vip.fubuki.playersync.config.JdbcConfig;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
/**
* Periodic {@code server_info.last_update} heartbeat.
*
* <p>Runs on a dedicated single-threaded scheduler at a fixed interval so peer
* servers can detect this server as alive via {@code isPeerServerStale()} in
* {@code VanillaSync.doPlayerJoin}. Without this, a server that stops issuing
* updates (e.g. hung main thread) would be treated as alive indefinitely by
* rejoining players on other servers, causing the 30s poll timeouts seen in
* production logs.
*
* @author vyrriox
*/
public final class HeartbeatService {
private HeartbeatService() {}
/** Heartbeat period: 10s. Short enough that a 60s staleness threshold catches real outages. */
private static final long PERIOD_MS = 10_000L;
private static final AtomicBoolean RUNNING = new AtomicBoolean(false);
private static ScheduledExecutorService scheduler;
public static void start() {
if (!RUNNING.compareAndSet(false, true)) return;
scheduler = Executors.newSingleThreadScheduledExecutor(r -> {
Thread t = new Thread(r, "PlayerSync-heartbeat");
t.setDaemon(true);
t.setPriority(Thread.MIN_PRIORITY);
return t;
});
scheduler.scheduleAtFixedRate(HeartbeatService::tick, PERIOD_MS, PERIOD_MS, TimeUnit.MILLISECONDS);
PlayerSync.LOGGER.info("[heartbeat] started (period={}ms, server_id={})", PERIOD_MS, JdbcConfig.SERVER_ID.get());
}
public static void stop() {
if (!RUNNING.compareAndSet(true, false)) return;
if (scheduler != null) {
scheduler.shutdownNow();
scheduler = null;
}
PlayerSync.LOGGER.info("[heartbeat] stopped");
}
private static void tick() {
try {
int serverId = JdbcConfig.SERVER_ID.get();
JDBCsetUp.executePreparedUpdate(
"UPDATE " + Tables.serverInfo() + " SET last_update=?, enable=1 WHERE id=?",
System.currentTimeMillis(), serverId);
} catch (Throwable t) {
// Do not kill the scheduler on a transient DB error log and retry next tick.
PlayerSync.LOGGER.warn("[heartbeat] tick failed: {}", t.getMessage());
}
}
}