Phase 3: anti-loss infrastructure (shutdown hook + heartbeat + crash recovery)
Adds three utilities to harden PlayerSync against ungraceful server exits:
CrashRecovery.java
- installShutdownHook: registers a non-daemon JVM shutdown hook that calls
VanillaSync.emergencyFlushAll() synchronously when the process is killed
(SIGTERM, kill, OOM, host reboot). Covers the case where the normal
ServerStoppingEvent path never runs.
- clearOrphanedOnlineFlags: on startup, clears any online=1 player_data
rows pointing to this server_id (left by a previous crash). Reports the
count via SyncLogger so admins can see recovery activity.
- reportZombiePeers: logs peer server_ids whose heartbeat is missing or
stale (>60s), exposing the root of doPlayerJoin poll timeouts.
HeartbeatService.java
- Single-thread daemon scheduler pinging server_info.last_update every 10s.
- Lets peer servers distinguish live from dead via isPeerServerStale().
- Stopped explicitly in VanillaSync.onServerShutdown before pool close.
VanillaSync.emergencyFlushAll()
- Synchronous best-effort flush for every online player. No executor, no
locks — the server is dying, we just want data on disk. Writes player_data,
backpacks, SS, RS2 directly; logs SAVE/SKIPPED/FAILED per player via
SyncLogger so post-mortem analysis is possible.
PlayerSync.onServerStarting wires the four new calls after table init.
Fixes the production issue where players remained online=1 forever after
kill -9 and the 30s poll timeouts waiting for zombie server_ids.
This commit is contained in:
parent
c84f920d11
commit
746cb56275
|
|
@ -207,6 +207,18 @@ public class PlayerSync {
|
|||
} catch (Exception e) {
|
||||
LOGGER.error("An exception occurred while trying change wrong player-status\n" + e.getMessage());
|
||||
}
|
||||
|
||||
// Phase 3: anti-loss infrastructure.
|
||||
// 1. Clear orphaned online=1 flags from previous unclean shutdown.
|
||||
// 2. Report zombie peer servers so admins see them in logs.
|
||||
// 3. Install JVM shutdown hook — covers kill -9 / OOM / host reboot.
|
||||
// 4. Start periodic heartbeat so peers can detect us as alive.
|
||||
vip.fubuki.playersync.util.CrashRecovery.clearOrphanedOnlineFlags();
|
||||
vip.fubuki.playersync.util.CrashRecovery.reportZombiePeers(60_000L);
|
||||
vip.fubuki.playersync.util.CrashRecovery.installShutdownHook(() ->
|
||||
vip.fubuki.playersync.sync.VanillaSync.emergencyFlushAll());
|
||||
vip.fubuki.playersync.util.HeartbeatService.start();
|
||||
|
||||
LOGGER.info("PlayerSync is ready!");
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1067,6 +1067,9 @@ public class VanillaSync {
|
|||
}
|
||||
JDBCsetUp.executePreparedUpdate("UPDATE " + Tables.serverInfo() + " SET enable=0 WHERE id=?", JdbcConfig.SERVER_ID.get());
|
||||
|
||||
// Phase 3: stop heartbeat before pool shutdown so its tick doesn't race with pool close.
|
||||
vip.fubuki.playersync.util.HeartbeatService.stop();
|
||||
|
||||
// Shut down the background executor — no new tasks after this point
|
||||
executorService.shutdown();
|
||||
try {
|
||||
|
|
@ -1087,6 +1090,57 @@ public class VanillaSync {
|
|||
vip.fubuki.playersync.util.SyncLogger.shutdown();
|
||||
}
|
||||
|
||||
/**
|
||||
* Phase 3 emergency flush invoked from the JVM shutdown hook (kill -9, OOM, host
|
||||
* reboot) when {@code onServerShutdown} never ran. Runs on the JVM shutdown thread,
|
||||
* synchronously, WITHOUT the executor (which may be already draining or dead).
|
||||
*
|
||||
* <p>Best-effort: snapshots and writes every still-online player using direct
|
||||
* DB calls. No lock acquisition — the server is dying, we just want data on disk.
|
||||
* If the DB pool is already closed, we log and exit gracefully.
|
||||
*/
|
||||
public static void emergencyFlushAll() {
|
||||
try {
|
||||
MinecraftServer server = ServerLifecycleHooks.getCurrentServer();
|
||||
if (server == null) {
|
||||
PlayerSync.LOGGER.warn("[emergency-flush] no server instance — nothing to flush");
|
||||
return;
|
||||
}
|
||||
int flushed = 0;
|
||||
for (ServerPlayer player : server.getPlayerList().getPlayers()) {
|
||||
String puuid = player.getUUID().toString();
|
||||
if (!player.getTags().contains("player_synced") || player.isDeadOrDying()) continue;
|
||||
try {
|
||||
final PlayerDataSnapshot snapshot = snapshotPlayerData(player);
|
||||
final Map<UUID, CompoundTag> backpackSnapshots = ModsSupport.snapshotBackpackData(player);
|
||||
final Map<UUID, CompoundTag> ssSnapshots = ModsSupport.snapshotSSData(ModsSupport.collectSSUuids(player));
|
||||
// Direct synchronous write (no executor, no lock).
|
||||
boolean persisted = writeSnapshotToDB(snapshot, true);
|
||||
if (persisted) {
|
||||
ModsSupport.saveBackpackSnapshots(backpackSnapshots);
|
||||
ModsSupport.saveSSSnapshots(ssSnapshots);
|
||||
if (ModList.get().isLoaded("refinedstorage")) {
|
||||
List<UUID> rs2 = ModsSupport.collectRS2DiskUuids(player);
|
||||
if (!rs2.isEmpty()) {
|
||||
ModsSupport.saveRS2DisksByLevel(rs2, player.serverLevel(), server.registryAccess());
|
||||
}
|
||||
}
|
||||
SyncLogger.saveCompleted(puuid, "EMERGENCY_FLUSH", 0);
|
||||
flushed++;
|
||||
} else {
|
||||
SyncLogger.saveSkipped(puuid, "EMERGENCY_FLUSH", "core guard blocked");
|
||||
}
|
||||
} catch (Throwable t) {
|
||||
PlayerSync.LOGGER.error("[emergency-flush] failed for {}: {}", puuid, t.getMessage());
|
||||
SyncLogger.saveFailed(puuid, "EMERGENCY_FLUSH", t.getMessage());
|
||||
}
|
||||
}
|
||||
PlayerSync.LOGGER.warn("[emergency-flush] flushed {} players via shutdown hook", flushed);
|
||||
} catch (Throwable t) {
|
||||
PlayerSync.LOGGER.error("[emergency-flush] top-level failure", t);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* FIX: Logout saves are now FULLY NON-BLOCKING on the main thread.
|
||||
*
|
||||
|
|
|
|||
131
src/main/java/vip/fubuki/playersync/util/CrashRecovery.java
Normal file
131
src/main/java/vip/fubuki/playersync/util/CrashRecovery.java
Normal file
|
|
@ -0,0 +1,131 @@
|
|||
package vip.fubuki.playersync.util;
|
||||
|
||||
import vip.fubuki.playersync.PlayerSync;
|
||||
import vip.fubuki.playersync.config.JdbcConfig;
|
||||
|
||||
import java.sql.ResultSet;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
/**
|
||||
* Crash-recovery + shutdown-hook helper.
|
||||
*
|
||||
* <p>Installs a JVM shutdown hook that flushes pending saves and writes a
|
||||
* graceful-shutdown marker into {@code server_info}. On next startup, scans
|
||||
* {@code player_data} for rows stuck at {@code online=1} on this server and
|
||||
* clears them — covers {@code kill -9} / OOM / JVM abort scenarios where the
|
||||
* normal ServerStoppingEvent path never ran.
|
||||
*
|
||||
* <p>Companion of {@link HeartbeatService} which keeps {@code server_info.last_update}
|
||||
* fresh so peer servers can detect this one as alive.
|
||||
*
|
||||
* @author vyrriox
|
||||
*/
|
||||
public final class CrashRecovery {
|
||||
|
||||
private CrashRecovery() {}
|
||||
|
||||
private static final AtomicBoolean HOOK_INSTALLED = new AtomicBoolean(false);
|
||||
private static volatile Runnable flushCallback;
|
||||
|
||||
/**
|
||||
* Registers a JVM shutdown hook. Called once from PlayerSync.onServerStarting
|
||||
* AFTER the DB pool is up. The {@code flushTask} is invoked on JVM shutdown —
|
||||
* use it to snapshot all still-online players synchronously (no async executor,
|
||||
* the pool may already be draining).
|
||||
*/
|
||||
public static void installShutdownHook(Runnable flushTask) {
|
||||
if (!HOOK_INSTALLED.compareAndSet(false, true)) return;
|
||||
flushCallback = flushTask;
|
||||
|
||||
Thread hook = new Thread(() -> {
|
||||
try {
|
||||
PlayerSync.LOGGER.warn("[crash-recovery] JVM shutdown hook fired — flushing pending saves");
|
||||
SyncLogger.playerEvent("SYSTEM", "JVM_SHUTDOWN_HOOK", "Flushing pending saves before JVM exit");
|
||||
if (flushCallback != null) {
|
||||
try {
|
||||
flushCallback.run();
|
||||
} catch (Throwable t) {
|
||||
PlayerSync.LOGGER.error("[crash-recovery] flush callback threw", t);
|
||||
}
|
||||
}
|
||||
// Mark this server as gracefully stopped so peers know it's dead.
|
||||
try {
|
||||
JDBCsetUp.executePreparedUpdate(
|
||||
"UPDATE " + Tables.serverInfo() + " SET enable=0, last_update=? WHERE id=?",
|
||||
System.currentTimeMillis(), JdbcConfig.SERVER_ID.get());
|
||||
} catch (Exception e) {
|
||||
PlayerSync.LOGGER.warn("[crash-recovery] could not mark server stopped: {}", e.getMessage());
|
||||
}
|
||||
} catch (Throwable t) {
|
||||
// NEVER let the hook throw — it would block JVM exit.
|
||||
PlayerSync.LOGGER.error("[crash-recovery] hook failed", t);
|
||||
}
|
||||
}, "PlayerSync-shutdown-hook");
|
||||
hook.setDaemon(false); // MUST be non-daemon: daemon threads are killed on exit
|
||||
Runtime.getRuntime().addShutdownHook(hook);
|
||||
PlayerSync.LOGGER.info("[crash-recovery] JVM shutdown hook installed");
|
||||
}
|
||||
|
||||
/**
|
||||
* Scans {@code player_data} for orphaned online=1 rows on this server and
|
||||
* clears them. Called from PlayerSync.onServerStarting AFTER the tables are
|
||||
* created. This is the recovery path for players who were online when the
|
||||
* server was killed ungracefully (kill -9, OOM, host reboot).
|
||||
*/
|
||||
public static void clearOrphanedOnlineFlags() {
|
||||
int serverId = JdbcConfig.SERVER_ID.get();
|
||||
try {
|
||||
// Count first so we know what we're about to clear.
|
||||
int count = 0;
|
||||
try (JDBCsetUp.QueryResult qr = JDBCsetUp.executePreparedQuery(
|
||||
"SELECT COUNT(*) AS c FROM " + Tables.playerData() + " WHERE last_server=? AND online=1",
|
||||
serverId)) {
|
||||
ResultSet rs = qr.resultSet();
|
||||
if (rs.next()) count = rs.getInt("c");
|
||||
}
|
||||
JDBCsetUp.executePreparedUpdate(
|
||||
"UPDATE " + Tables.playerData() + " SET online=0 WHERE last_server=? AND online=1",
|
||||
serverId);
|
||||
if (count > 0) {
|
||||
PlayerSync.LOGGER.warn("[crash-recovery] cleared {} orphan online=1 rows from previous session (server_id={})",
|
||||
count, serverId);
|
||||
SyncLogger.playerEvent("SYSTEM", "ORPHAN_CLEAR",
|
||||
"Cleared " + count + " online=1 rows left by previous session crash");
|
||||
} else {
|
||||
PlayerSync.LOGGER.info("[crash-recovery] no orphan online=1 rows found — previous shutdown was clean");
|
||||
}
|
||||
} catch (Exception e) {
|
||||
PlayerSync.LOGGER.error("[crash-recovery] failed to scan for orphans", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reports peer servers whose heartbeat is stale. Informational — useful to
|
||||
* surface zombie server_ids that could trip doPlayerJoin's poll. Called once
|
||||
* on startup.
|
||||
*/
|
||||
public static void reportZombiePeers(long staleAfterMs) {
|
||||
try (JDBCsetUp.QueryResult qr = JDBCsetUp.executePreparedQuery(
|
||||
"SELECT id, last_update FROM " + Tables.serverInfo() + " WHERE enable=1 AND id<>?",
|
||||
JdbcConfig.SERVER_ID.get())) {
|
||||
ResultSet rs = qr.resultSet();
|
||||
long now = System.currentTimeMillis();
|
||||
int zombies = 0;
|
||||
while (rs.next()) {
|
||||
int id = rs.getInt("id");
|
||||
long last = rs.getLong("last_update");
|
||||
long age = now - last;
|
||||
if (id == 0 || age > staleAfterMs) {
|
||||
zombies++;
|
||||
PlayerSync.LOGGER.warn("[crash-recovery] peer server_id={} is zombie (last_update age={}ms, enabled=true)",
|
||||
id, age);
|
||||
}
|
||||
}
|
||||
if (zombies > 0) {
|
||||
SyncLogger.playerEvent("SYSTEM", "ZOMBIE_PEERS", zombies + " peer server(s) appear stale");
|
||||
}
|
||||
} catch (Exception e) {
|
||||
PlayerSync.LOGGER.warn("[crash-recovery] zombie peer scan failed: {}", e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,65 @@
|
|||
package vip.fubuki.playersync.util;
|
||||
|
||||
import vip.fubuki.playersync.PlayerSync;
|
||||
import vip.fubuki.playersync.config.JdbcConfig;
|
||||
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.ScheduledExecutorService;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
/**
|
||||
* Periodic {@code server_info.last_update} heartbeat.
|
||||
*
|
||||
* <p>Runs on a dedicated single-threaded scheduler at a fixed interval so peer
|
||||
* servers can detect this server as alive via {@code isPeerServerStale()} in
|
||||
* {@code VanillaSync.doPlayerJoin}. Without this, a server that stops issuing
|
||||
* updates (e.g. hung main thread) would be treated as alive indefinitely by
|
||||
* rejoining players on other servers, causing the 30s poll timeouts seen in
|
||||
* production logs.
|
||||
*
|
||||
* @author vyrriox
|
||||
*/
|
||||
public final class HeartbeatService {
|
||||
|
||||
private HeartbeatService() {}
|
||||
|
||||
/** Heartbeat period: 10s. Short enough that a 60s staleness threshold catches real outages. */
|
||||
private static final long PERIOD_MS = 10_000L;
|
||||
|
||||
private static final AtomicBoolean RUNNING = new AtomicBoolean(false);
|
||||
private static ScheduledExecutorService scheduler;
|
||||
|
||||
public static void start() {
|
||||
if (!RUNNING.compareAndSet(false, true)) return;
|
||||
scheduler = Executors.newSingleThreadScheduledExecutor(r -> {
|
||||
Thread t = new Thread(r, "PlayerSync-heartbeat");
|
||||
t.setDaemon(true);
|
||||
t.setPriority(Thread.MIN_PRIORITY);
|
||||
return t;
|
||||
});
|
||||
scheduler.scheduleAtFixedRate(HeartbeatService::tick, PERIOD_MS, PERIOD_MS, TimeUnit.MILLISECONDS);
|
||||
PlayerSync.LOGGER.info("[heartbeat] started (period={}ms, server_id={})", PERIOD_MS, JdbcConfig.SERVER_ID.get());
|
||||
}
|
||||
|
||||
public static void stop() {
|
||||
if (!RUNNING.compareAndSet(true, false)) return;
|
||||
if (scheduler != null) {
|
||||
scheduler.shutdownNow();
|
||||
scheduler = null;
|
||||
}
|
||||
PlayerSync.LOGGER.info("[heartbeat] stopped");
|
||||
}
|
||||
|
||||
private static void tick() {
|
||||
try {
|
||||
int serverId = JdbcConfig.SERVER_ID.get();
|
||||
JDBCsetUp.executePreparedUpdate(
|
||||
"UPDATE " + Tables.serverInfo() + " SET last_update=?, enable=1 WHERE id=?",
|
||||
System.currentTimeMillis(), serverId);
|
||||
} catch (Throwable t) {
|
||||
// Do not kill the scheduler on a transient DB error — log and retry next tick.
|
||||
PlayerSync.LOGGER.warn("[heartbeat] tick failed: {}", t.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user