From 024342910d3db83787d984f7c534cac33fab1c24 Mon Sep 17 00:00:00 2001
From: Winlin <winlinvip@gmail.com>
Date: Thu, 5 Mar 2026 09:57:08 -0500
Subject: [PATCH] OpenClaw: add and refine ST knowledge-base and
 learning/review skills (#4643)

- Add a comprehensive ST knowledge base document:
- openclaw/memory/srs-coroutines.md
- Add ST-focused developer skill:
- openclaw/skills/st-develop/SKILL.md
- openclaw/skills/st-develop/scripts/verify.sh
- Add KB workflow skills that support ST documentation quality and
learning:
- openclaw/skills/kb-review/SKILL.md
- openclaw/skills/srs-learn/SKILL.md
- Update openclaw/skills/srs-support/SKILL.md to use dynamic SRS_ROOT
path resolution, improving portability for KB/source
 loading.

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: chatgpt-codex-connector[bot] <199175422+chatgpt-codex-connector[bot]@users.noreply.github.com>
---
 .agents/skills                                |   1 +
 .claude/skills                                |   1 +
 .gitignore                                    |   4 +-
 .vscode/launch.json                           |  29 +-
 .vscode/settings.json                         |   6 +-
 .vscode/tasks.json                            |  13 +-
 cmake/CMakeLists.txt                          |   5 +
 openclaw/.gitignore                           |   1 +
 openclaw/MEMORY.md                            |   6 +-
 openclaw/memory/2026-02-06.md                 |   2 +-
 openclaw/memory/srs-coroutines.md             | 928 ++++++++++++++++++
 openclaw/skills/kb-review/SKILL.md            |  68 ++
 openclaw/skills/srs-learn/SKILL.md            | 134 +++
 openclaw/skills/srs-support/SKILL.md          |  17 +-
 openclaw/skills/st-develop/SKILL.md           |  67 ++
 openclaw/skills/st-develop/scripts/verify.sh  |  26 +
 .../blog/2024-10-18-Hidden-Flaws-of-SRS.md    | 595 +++++++++++
 trunk/3rdparty/st-srs/.gitignore              |   2 +
 .../{ide/st_clion => cmake}/CMakeLists.txt    |  17 +-
 trunk/3rdparty/st-srs/event.c                 |   5 +-
 trunk/3rdparty/st-srs/key.c                   |   4 +-
 trunk/3rdparty/st-srs/md_darwin.S             |   5 +-
 trunk/3rdparty/st-srs/md_linux2.S             |   5 +-
 trunk/3rdparty/st-srs/utest/Makefile          |  16 +-
 trunk/3rdparty/st-srs/utest/st_utest.hpp      |   3 +-
 .../st-srs/utest/st_utest_learn_kb.cpp        | 802 +++++++++++++++
 trunk/3rdparty/st-srs/utest/st_utest_tcp.cpp  |   3 +-
 trunk/doc/CHANGELOG.md                        |   1 +
 28 files changed, 2726 insertions(+), 40 deletions(-)
 create mode 120000 .agents/skills
 create mode 120000 .claude/skills
 create mode 100644 cmake/CMakeLists.txt
 create mode 100644 openclaw/.gitignore
 create mode 100644 openclaw/memory/srs-coroutines.md
 create mode 100644 openclaw/skills/kb-review/SKILL.md
 create mode 100644 openclaw/skills/srs-learn/SKILL.md
 create mode 100644 openclaw/skills/st-develop/SKILL.md
 create mode 100755 openclaw/skills/st-develop/scripts/verify.sh
 create mode 100644 trunk/3rdparty/srs-docs/blog/2024-10-18-Hidden-Flaws-of-SRS.md
 rename trunk/3rdparty/st-srs/{ide/st_clion => cmake}/CMakeLists.txt (88%)
 create mode 100644 trunk/3rdparty/st-srs/utest/st_utest_learn_kb.cpp

diff --git a/.agents/skills b/.agents/skills
new file mode 120000
index 000000000..aa16b1bf4
--- /dev/null
+++ b/.agents/skills
@@ -0,0 +1 @@
+../openclaw/skills
\ No newline at end of file
diff --git a/.claude/skills b/.claude/skills
new file mode 120000
index 000000000..aa16b1bf4
--- /dev/null
+++ b/.claude/skills
@@ -0,0 +1 @@
+../openclaw/skills
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index b934528a6..1653198e9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,6 +36,6 @@
 
 
 cmake-build-debug
-/build/
+/build
+/cmake/build
 /trunk/cmake/build
-
diff --git a/.vscode/launch.json b/.vscode/launch.json
index 6e4777112..2c67c3871 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -5,7 +5,7 @@
           "name": "Debug SRS with conf/console.conf",
           "type": "cppdbg",
           "request": "launch",
-          "program": "${workspaceFolder}/trunk/cmake/build/srs",
+          "program": "${workspaceFolder}/cmake/build/srs-build/srs",
           "args": ["-c", "conf/console.conf"],
           "stopAtEntry": false,
           "cwd": "${workspaceFolder}/trunk",
@@ -33,7 +33,7 @@
           "name": "Debug SRS with conf/rtc.conf",
           "type": "cppdbg",
           "request": "launch",
-          "program": "${workspaceFolder}/trunk/cmake/build/srs",
+          "program": "${workspaceFolder}/cmake/build/srs-build/srs",
           "args": ["-c", "conf/rtc.conf"],
           "stopAtEntry": false,
           "cwd": "${workspaceFolder}/trunk",
@@ -58,7 +58,7 @@
           }
         },
         {
-          "name": "Debug srs-proxy",
+          "name": "Debug SRS Proxy Server (Go)",
           "type": "go",
           "request": "launch",
           "mode": "auto",
@@ -69,7 +69,7 @@
           "name": "Debug SRS (macOS, CodeLLDB) console.conf",
           "type": "lldb",
           "request": "launch",
-          "program": "${workspaceFolder}/trunk/cmake/build/srs",
+          "program": "${workspaceFolder}/cmake/build/srs-build/srs",
           "args": ["-c", "console.conf"],
           "cwd": "${workspaceFolder}/trunk",
           "stopOnEntry": false,
@@ -82,16 +82,33 @@
           "sourceLanguages": ["cpp"]
         },
         {
-          "name": "Debug gtest (macOS CodeLLDB)",
+          "name": "Debug SRS gtest (macOS CodeLLDB)",
           "type": "lldb",
           "request": "launch",
-          "program": "${workspaceFolder}/trunk/cmake/build/utest",
+          "program": "${workspaceFolder}/cmake/build/srs-build/utest",
           "args": ["--gtest_filter=*${selectedText}*"],
           "cwd": "${workspaceFolder}/trunk",
           "terminal": "integrated",
           "initCommands": [
             "command script import lldb.formatters.cpp.libcxx"
           ],
+          "preLaunchTask": "build",
+          "env": {},
+          "sourceLanguages": ["cpp"]
+        },
+        {
+          "name": "Debug ST (StateThreads) gtest (macOS CodeLLDB)",
+          "type": "lldb",
+          "request": "launch",
+          "program": "${workspaceFolder}/cmake/build/st-build/st_utest",
+          "args": ["--gtest_filter=*${selectedText}*"],
+          "cwd": "${workspaceFolder}/trunk",
+          "terminal": "integrated",
+          "initCommands": [
+            "command script import lldb.formatters.cpp.libcxx"
+          ],
+          "preLaunchTask": "st-build",
+          "env": {},
           "sourceLanguages": ["cpp"]
         }
     ]
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 0ebeffd84..5431da709 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,11 +1,11 @@
 {
-    "cmake.sourceDirectory": "${workspaceFolder}/trunk/cmake",
-    "cmake.buildDirectory": "${workspaceFolder}/trunk/cmake/build",
+    "cmake.sourceDirectory": "${workspaceFolder}/cmake",
+    "cmake.buildDirectory": "${workspaceFolder}/cmake/build",
     "cmake.configureOnOpen": false,
     "cmake.ctest.testExplorerIntegrationEnabled": false,
     "testMate.cpp.test.advancedExecutables": [ 
         "{build,Build,BUILD,out,Out,OUT}/**/*{test,Test,TEST}*",
-        "${workspaceFolder}/trunk/cmake/build/**/*{utest,test,Test,TEST}*"
+        "${workspaceFolder}/cmake/build/**/*{utest,test,Test,TEST}*"
     ],
     "files.associations": {
         "vector": "cpp",
diff --git a/.vscode/tasks.json b/.vscode/tasks.json
index 3e439de19..ca55beabf 100644
--- a/.vscode/tasks.json
+++ b/.vscode/tasks.json
@@ -4,13 +4,24 @@
       {
         "label": "build",
         "type": "shell",
-        "command": "cmake --build ${workspaceFolder}/trunk/cmake/build",
+        "command": "cd ${workspaceFolder}/cmake/build && cmake --build . --target srs utest",
         "group": {
           "kind": "build",
           "isDefault": true
         },
         "problemMatcher": ["$gcc"],
         "detail": "Build SRS by cmake."
+      },
+      {
+        "label": "st-build",
+        "type": "shell",
+        "command": "cd ${workspaceFolder}/cmake/build && cmake --build . --target st_utest",
+        "group": {
+          "kind": "build",
+          "isDefault": true
+        },
+        "problemMatcher": ["$gcc"],
+        "detail": "Build ST by cmake."
       }
     ]
   }
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
new file mode 100644
index 000000000..d478ec712
--- /dev/null
+++ b/cmake/CMakeLists.txt
@@ -0,0 +1,5 @@
+cmake_minimum_required(VERSION 3.10)
+project(srs-all)
+
+add_subdirectory(../trunk/cmake srs-build)
+add_subdirectory(../trunk/3rdparty/st-srs/cmake st-build)
diff --git a/openclaw/.gitignore b/openclaw/.gitignore
new file mode 100644
index 000000000..5083e56e8
--- /dev/null
+++ b/openclaw/.gitignore
@@ -0,0 +1 @@
+.openclaw/workspace-state.json
diff --git a/openclaw/MEMORY.md b/openclaw/MEMORY.md
index a586c55da..2bdb99a89 100644
--- a/openclaw/MEMORY.md
+++ b/openclaw/MEMORY.md
@@ -1,7 +1,7 @@
 # MEMORY.md - SRSBot's Long-Term Memory
 
 ## Workspace Conventions
-- Git commit titles start with: `Openclaw:`
+- Git commit titles start with: `OpenClaw:`
 - **No auto-commit** — Never automatically git commit. Only commit when William explicitly tells me to.
 - **No guessing** — William will teach me everything about SRS. Don't speculate or fill in gaps. Wait for him to explain.
 
@@ -29,6 +29,9 @@
 - Open source sustainability and contributor experience
 - Real-time media protocols, architecture, performance
 
+## Formatting Preferences
+- **Markdown headings:** Only use `#` and `##`. Never use `###` or deeper — use **bold text** instead for sub-sections.
+
 ## Content Preferences
 **YouTube videos (title, description, and scripts):** Always use problem-solving structure:
 1. What's wrong?
@@ -61,6 +64,7 @@ The three layers are what William controls; the external conditions are what the
 ## SRS Knowledge Base
 Detailed SRS knowledge in `memory/srs-*.md` files:
 - `srs-overview.md` — What SRS is, protocols, ecosystem tools, and **Features section** with all SRS features, versions, and dates
+- `srs-coroutines.md` — State Threads (ST) coroutine library, why SRS uses coroutines, how coroutine switching works, maintenance burden (platform matrix, Windows/SEH), and multi-CPU strategy (cluster > multi-threading)
 
 ### Rule: Keep Feature List Updated
 When creating new features, updating protocols, or making changes to SRS capabilities, **always update the Features section in `memory/srs-overview.md`** with the feature name, description, version, and date.
diff --git a/openclaw/memory/2026-02-06.md b/openclaw/memory/2026-02-06.md
index 11d690a13..b79d6350b 100644
--- a/openclaw/memory/2026-02-06.md
+++ b/openclaw/memory/2026-02-06.md
@@ -1,7 +1,7 @@
 # 2026-02-06 — Daily Log
 
 ## Commit Convention
-- Commit titles in this workspace should start with: `Openclaw:`
+- Commit titles in this workspace should start with: `OpenClaw:`
 
 ## Why Build an AI Knowledge Base for SRS
 
diff --git a/openclaw/memory/srs-coroutines.md b/openclaw/memory/srs-coroutines.md
new file mode 100644
index 000000000..83497d3e8
--- /dev/null
+++ b/openclaw/memory/srs-coroutines.md
@@ -0,0 +1,928 @@
+# SRS Coroutines
+
+SRS uses **State Threads (ST)** — a C coroutine library that provides lightweight user-space threads. It is the cornerstone of SRS's architecture.
+
+**Key insight:** ST gives SRS the programming model of Go (one coroutine per connection, sequential code, state in local variables) but in C/C++. It's essentially a C implementation of Go's concurrency model, used by SRS's C++ codebase.
+
+This is why the code is maintainable despite handling thousands of concurrent connections — each connection handler reads like a simple sequential function.
+
+## Why a Media Server Needs to Manage State per Connection
+
+A media server must serve many connections simultaneously — thousands of RTMP, HTTP, WebRTC clients at once. Each connection has **state**: handshake data, protocol parameters, stream URLs, buffers. This state lives in local variables and function call stacks.
+
+Example: An RTMP client connects → you accept the TCP connection → do RTMP handshake (read bytes, generate response, store handshake state) → then enter the RTMP connect phase (read TC URL, host, stream ID, parameters). All of this is state for that one connection.
+
+Now multiply by thousands of connections. You need to serve one, switch to another, come back to the first — like talking to hundreds of people at once, each conversation having its own context.
+
+## Three Approaches to Managing Per-Connection State
+
+**1. OS Threads (one per connection)**
+- Each thread has its own call stack and local variables → state is naturally stored
+- Writing code is easy — just sequential logic per connection
+- **Problem:** OS threads are expensive. Thousands of threads = expensive context switching, poor performance
+
+**2. Async/Event Loop (Nginx model)**
+- Single thread, big poll loop: poll/wait → serve connection A → save state → return → serve connection B → ...
+- **Problem:** You must manually save ALL state into a connection struct because local variables are destroyed when you return from the function. Code becomes much more complex to maintain.
+
+**3. Coroutines / User-Space Threads (Go model, SRS model)**
+- Lightweight threads in user space — same benefits as OS threads (local variables, call stacks, natural state storage) but without the OS overhead
+- Each connection gets its own coroutine with its own stack
+- Code reads like simple sequential logic (like OS threads) but performs like async
+- **This is what Go does with goroutines. SRS does the same thing using the State Threads (ST) library — a C library used by SRS's C++ codebase.**
+
+## How Coroutine Switching Works
+
+The tradeoff: coroutines make application code easy, but **someone has to implement the coroutine mechanism itself**.
+
+When serving a connection, you can't just call a function to switch to another connection — that would push onto the same call stack. Instead, you need a **lightweight thread switch**: save the current coroutine's CPU registers to its memory, then load the target coroutine's saved registers and resume from where it left off.
+
+This is the same concept as OS thread context switching, but:
+- **OS thread switch:** heavy, involves kernel, expensive
+- **Coroutine switch:** user-space only, just save/restore registers, very cheap
+
+ST originally used libc's `setjmp`/`longjmp` for context switching. But glibc later started encrypting (mangling) the saved context for security, making it impossible to manipulate the stack pointer from user code. So ST had to reimplement setjmp/longjmp in pure assembly — that's what `_st_md_cxt_save`/`_st_md_cxt_restore` are. They do exactly what setjmp/longjmp do (save and restore callee-saved registers, stack pointer, and program counter) but without glibc's encryption, giving ST full control over coroutine stacks.
+
+To implement this, you need to understand how function calls work at the CPU level — registers, stack pointers, program counters. The coroutine library handles all of this so application code never has to think about it.
+
+## Timeout Heap: How ST Manages Sleeping Coroutines
+
+When a coroutine calls `st_usleep()` or any I/O function with a timeout, ST puts it to sleep and must wake it at the right time. This requires a data structure that efficiently tracks all sleeping coroutines ordered by their wake-up time.
+
+**Original design (pre-1.5):** A sorted linked list. Insertion was O(N) — for every new sleeper, ST walked the list to find the right position. With thousands of sleeping coroutines, this became a bottleneck.
+
+**Current design (since ST 1.5):** A binary heap implemented as a balanced binary tree using pointers (not an array). This gives O(log N) insertion and removal. In benchmarks, 1 million sleep queue insertions/removals dropped from 100 seconds to 12 seconds.
+
+**Why a pointer-based tree instead of an array?** ST's codebase is structured around linking `_st_thread_t` objects via embedded pointers — no auxiliary data structures. The heap reuses this pattern: each thread object has `left` and `right` child pointers, and `_st_this_vp.sleep_q` points to the root (the thread with the earliest timeout). The tree stays fully balanced and left-adjusted, numbered like an implicit array heap (node N has children 2N and 2N+1), but navigated via pointers from root to leaves using the binary digits of the target index.
+
+**The heap invariant:** Parents always time out before children, so the root is always the next coroutine to wake. This is how ST's scheduler knows which timer fires next when it enters epoll/kqueue — it just checks the root's timeout.
+
+## ST Library Origin and Design
+
+State Threads is derived from Netscape Portable Runtime (NSPR). It's not a general-purpose threading library — it specifically targets Internet Applications (servers that are network I/O driven).
+
+**License:** ST is dual-licensed under **MPL 1.1 or GPLv2** (user's choice). Example code is BSD-licensed. MPL 1.1 is a weak copyleft — changes to MPL-licensed *files* must stay open, but MPL code can be combined with code under other licenses (including proprietary). This is compatible with SRS's MIT license. The GPLv2 option is there for projects that prefer GPL-family licensing.
+
+Key design properties:
+- **Deterministic scheduling:** Context switch can only happen at I/O points or explicit synchronization points — never preemptive, never time-sliced
+- **No locks needed:** Because switching is deterministic, global data doesn't need mutex protection in most cases. The entire application can freely use static variables and non-reentrant library functions
+- **Minimal syscalls:** No per-thread signal mask (unlike POSIX threads), so no save/restore of signal mask on context switch — eliminates two syscalls per switch
+- **~5000 lines of code:** Small enough to understand completely, but requires assembly per CPU/OS platform
+
+SRS maintains the fork at `ossrs/state-threads` (branch `srs`), continuously updating it to support modern CPUs and OSes including Linux, macOS, Windows, and architectures like x86_64, ARMv7, AARCH64, Apple M1, RISC-V, LoongArch, and MIPS.
+
+## Non-Network I/O: The Disk Read Problem
+
+ST's non-blocking I/O only works for network sockets. **Disk I/O, device I/O, and stdin all block the entire process** — every coroutine stalls until the operation completes. Disk writes are usually fine (they go to buffer cache), but disk reads can block for unpredictable durations. This is a known limitation of ST's architecture that still exists in SRS.
+
+## ST's Key Constraint and Design Tradeoffs
+
+ST has one fundamental constraint: **all socket I/O must use ST's own I/O functions** (`st_read`, `st_write`, `st_accept`, etc.). If application code calls raw `read(2)` or `write(2)` on a socket, it bypasses ST's scheduler — the entire process blocks, and all coroutines stall. This is why integrating third-party libraries (like libsrt) requires wrapping their I/O to be coroutine-aware (see "Coroutine-Native SRT" below).
+
+**Signal handling:** ST's scheduler only detects two event types: I/O readiness and timeouts. To handle signals (like `SIGINT` or `SIGUSR1`), the standard pattern is to convert them to I/O events — the signal handler writes a byte to a pipe, and a coroutine reads from that pipe via `st_read`. This works because `write(2)` is async-signal-safe.
+
+## The Burden: Maintaining a C Coroutine Library
+
+Coroutines are a fantastic idea for a media server, but unlike Go (where goroutines are built into the language and runtime), **C/C++ has no standard coroutine library for this model**. (Note: C++20 co_await/co_yield is a different mechanism — not the same as user-space threads with full stacks.)
+
+**Platform Support Matrix**
+The coroutine switch must be implemented in **assembly language per CPU architecture**: ARM, ARMv8/AArch64, x86_64, MIPS — each has different register conventions. Multiply by OS (Linux, macOS, Windows) and you get a support matrix that is a maintenance burden.
+
+Nobody else actively maintains this library — the SRS project has to maintain it itself. Very few people understand coroutine switching at this level.
+
+**The Windows/SRT Problem (Why SRS 6 Dropped Windows)**
+- SRS added Windows support using a custom coroutine implementation (fiber/win64-based)
+- SRT (Secure Reliable Transport) is a multi-thread library that uses **C++ exceptions**
+- On Windows, C++ exceptions use a platform-specific mechanism (SEH — Structured Exception Handling) that **conflicts with the coroutine stack switching**
+- This caused crashes that were extremely difficult to diagnose
+- William investigated but could not fix it — the interaction between SEH and custom stacks is poorly documented
+- **Result:** SRS 6 removed Windows support because SRT + coroutines couldn't coexist on Windows
+- **Important distinction:** SRS (the server) dropped Windows support, but ST (the coroutine library) still retains Cygwin64 support — Cygwin64-related files were not removed from ST. ST is a standalone coroutine library and there's no reason to remove working platform support from it just because SRS the server no longer targets Windows.
+
+**Toolchain Gap**
+Go provides built-in tools: goroutine stack traces, scheduling profilers, debuggers that understand goroutines. ST has a simple coroutine scheduler driven by I/O events and timers (not an OS thread scheduler), and includes basic `DEBUG_STATS` instrumentation (scheduler timing distribution, thread run/idle/yield counts, per-I/O-call and EAGAIN stats, epoll dispatch stats). But compared to Go's tooling:
+- ST includes **GDB helper scripts**: `nn_coroutines` (show count of coroutines) and `show_coroutines` (list all coroutines with their caller functions). These provide basic coroutine-aware debugging within GDB. However, compared to Go's integrated tooling (goroutine stack traces in panics, `runtime/pprof`, scheduler tracing), these are manual GDB extensions rather than native runtime instrumentation.
+- No high-level performance analysis or visualization for coroutine scheduling
+- Instrumentation exists but is basic counters, not integrated tooling
+
+**Debugging and Profiling Limitations**
+- `perf -g` (stack traces) does not work with ST because ST modifies the stack pointer (SP), breaking frame pointer-based stack walking
+- Valgrind requires ST-specific hooks, supported since SRS 3+
+- ASAN (Address Sanitizer) is supported since SRS 5+, enabled by default in SRS 5, disabled by default in SRS 6 because it sometimes causes crashes for unknown reasons
+- **Testing:** ST has a unit test suite using Google Test (gtest), with code coverage via gcov/gcovr. Tests can be built with `make linux-debug-utest` or `make darwin-debug-utest`.
+- These tools help but are workarounds — there are still no native tools that understand coroutine scheduling the way Go's runtime tools understand goroutines
+
+**Can AI Help?**
+This is a niche domain — not common knowledge. But AI has access to all the code, assembly specs, and documentation. There's hope that AI could maintain the coroutine library (especially for new CPU/OS ports), but it's unproven. The Windows/SEH problem is an example of something that might be too complex even for AI — or might be exactly where AI excels.
+
+## Valgrind Support
+
+Valgrind can't track ST coroutines by default because `setjmp`/`longjmp` switches the stack pointer to custom-allocated stacks that Valgrind doesn't know about, causing false positives.
+
+**The fix** (merged from [toffaletti's fork](https://github.com/toffaletti/state-threads/commit/7f57fc9acc05e657bca1223f1e5b9b1a45ed929b), [commit 4cca7a0](https://github.com/ossrs/state-threads/commit/4cca7a0272b70b184742dd68065af8a9a42e030f)):
+- Uses `VALGRIND_STACK_REGISTER(top, bottom)` when creating a coroutine to tell Valgrind about the custom stack
+- Uses `VALGRIND_STACK_DEREGISTER(id)` when the coroutine exits
+- Stores the registration ID in `_st_stack_t.valgrind_stack_id`
+- Skips the primordial thread (its stack is the normal process stack, already known to Valgrind)
+
+**Opt-in via compile flag:** `-DMD_VALGRIND -I/usr/local/include`. Zero overhead when not enabled — `NVALGRIND` is defined by default to disable all Valgrind macros.
+
+**What changed (3 files):**
+- **common.h** — Added `MD_VALGRIND`/`NVALGRIND` macro logic; added `valgrind_stack_id` field to `_st_stack_t`
+- **sched.c** — Included `<valgrind/valgrind.h>`; added `VALGRIND_STACK_REGISTER` in `st_thread_create()` and `VALGRIND_STACK_DEREGISTER` in `st_thread_exit()`
+- **README** — Added build instructions for Linux with Valgrind
+
+## Stack Memory Management: Cache vs Free
+
+By default, ST caches all thread stacks forever — when a coroutine exits, its stack goes onto a free list and gets reused by the next `_st_stack_new` call. This is efficient for long-running servers with stable thread counts, but wastes memory when threads are short-lived (stacks accumulate and never shrink).
+
+**Compile-time flag `MD_CACHE_STACK`** ([state-threads#38](https://github.com/ossrs/state-threads/issues/38), [commit b019860](https://github.com/ossrs/state-threads/commit/b01986064cf01de86cea7b24a2f95e7114ba3d75)) controls the behavior:
+
+- **With `MD_CACHE_STACK`** (original behavior): Freed stacks stay on the free list. `_st_stack_new` searches the list for a stack of sufficient size before allocating a new one.
+- **Without `MD_CACHE_STACK`**: Stacks are actually freed (`munmap`/`free`). When `_st_stack_new` runs, it first drains the entire free list — unmapping every cached stack — then allocates fresh.
+
+**Why not free immediately in `_st_stack_free`?** When a coroutine exits, it's still *running on its own stack* during cleanup. Freeing the stack out from under a running coroutine would crash. So `_st_stack_free` always appends to the free list, and the actual deallocation happens later in `_st_stack_new` (when a different coroutine is running on a different stack). The re-enabled `_st_delete_stk_segment` function handles the actual `munmap` or `free`.
+
+## Coroutine-Native SRT
+
+SRS 4.0 (2019) added SRT support, but the initial implementation used libsrt's own threads and async I/O, separate from ST. This resulted in complex async code that was difficult to maintain.
+
+In SRS 5.0, SRT was rewritten to be **coroutine-native** ([srs#3010](https://github.com/ossrs/srs/pull/3010)). The pattern for making any protocol coroutine-native:
+
+1. Call the protocol's API (e.g., `srt_recvmsg`)
+2. If success, return the data
+3. If the call failed with anything other than "would block" (e.g., `SRT_EASYNCRCV` for receive), return the error
+4. If "would block", switch the current coroutine via `st_cond_t` condition variable and let other coroutines run
+5. When the fd becomes ready (detected by `srt_epoll_uwait` in a poller coroutine), signal the condition variable to wake the waiting coroutine
+6. Repeat from step 1
+
+This is the same pattern ST uses internally for TCP (`st_read` handles `EAGAIN` the same way), just adapted to SRT's epoll API.
+
+**The maintainability win:** In callback/async style, connection state must live in global data structures and gets modified by different event callbacks — the object lifecycle is scattered across the event loop. In coroutine-native style, state lives in local variables on the coroutine stack, and the lifecycle is linear and contained in one coroutine function. This is the fundamental reason SRS uses coroutines.
+
+**Remaining issue:** libsrt uses C++ exceptions internally, which still causes the Windows/SEH compatibility problem described above. The coroutine-native rewrite solved the threading and maintainability issues but did not solve Windows portability. The fix requires either rewriting libsrt to avoid C++ exceptions or fixing the SEH/coroutine stack interaction on Windows. Not fixed yet, planned for the future.
+
+## Multi-CPU: Cluster, Not Multi-Threading
+
+**Problem:** SRS uses single-threaded coroutines → only saturates one CPU core. Modern servers have many cores.
+
+**Why Not Multi-Threading?**
+ST library actually supports multi-threading, and William added multi-thread support. But it turned out to be a disaster:
+
+- Even with thread-local isolation (separate thread-safe coroutine schedulers), threads must still **communicate** with each other
+- **The biggest problem: load balancing between threads is nearly impossible to estimate.** Different threads have different capacity, and you can't easily observe the load distribution
+- With single-thread: observing load is trivial — one CPU, 60-70% threshold, done
+- With multi-thread: complexity explodes, load becomes opaque
+- **William's verdict: multi-threading doesn't solve the multi-CPU problem — it creates new, worse problems. It's a trauma maker.**
+
+**The Right Solution: Proxy + Origin + Edge Cluster**
+
+This is a **settled and confirmed decision**: SRS will remain single-process, single-threaded with coroutines. Multi-threading will be removed from SRS. The multi-CPU problem is solved entirely by the cluster architecture:
+
+- **Proxy** (implemented in Go): Stateless, horizontally scalable, synchronizes state through Redis. Supports all protocols (RTMP/FLV/HLS/SRT/WebRTC). Proxies API and media traffic to Origin servers.
+- **Origin** (SRS, C++): Single-threaded with coroutines. Handles stream processing and protocol conversion.
+- **Edge** (SRS, C++): Single-threaded with coroutines. Caches streams from Origin for massive playback distribution.
+
+Multiple Origins behind a Proxy, combined with Edge servers, can scale to thousands of streams and tens of thousands of viewers per stream. Each component stays simple and observable — one CPU, one process, coroutines.
+
+## Multi-threading Timeline (Historical)
+
+SRS has traditionally been single-process, single-threaded, akin to a single-process version of Nginx, with the addition of coroutines for concurrent processing. Coroutines are implemented using the StateThreads library, which has been modified to support thread-local functionality for operation in a multi-threaded environment.
+
+Despite experimenting and analyzing thread-local handling for a media architecture over the years, SRS has not adopted a thread-local approach but rather a different multi-threaded architecture that is still in the planning stage: Stream processing occurs on a single thread, while blocking operations like logging, file writing, and DNS resolution are handled by separate threads. In essence, SRS uses multi-threading to address blocking issues. If Linux supports fully asynchronous I/O in the future, multi-threading may not be necessary, as seen in liburing.
+
+StateThreads multi-threading faces issues with Windows C++ exception handling. Windows' exception mechanism differs from Linux, causing compatibility problems when StateThreads implements setjmp and longjmp, as discussed in SEH.
+
+Challenges with multi-thread scheduling and load balancing: While thread-local multi-threading addresses multi-core utilization, it still requires the publishing and playback of a stream to stay on a single thread, preventing complete load balancing across multiple threads. Without thread-local functionality, serious locking and contention issues arise. Essentially, it's like running multiple K8s Pods within a single process and handling scheduling, monitoring, and load balancing internally, which can be quite complex.
+
+In SRS 5.0, StateThreads was restructured to support thread-local functionality, and SRS started running a main thread with subthreads to transition the architecture into a multi-threaded model. However, various issues arose during subsequent stages, so SRS 6.0 returned to a single-threaded architecture by default. Multi-threading capabilities will be removed as the Proxy and Edge cluster architecture fully replaces them.
+
+Additionally, we explored another potential architecture where specific capabilities are distributed across different threads, like using separate threads for WebRTC encryption and decryption. However, this approach transforms into a typical multi-threaded program rather than a thread-local architecture, resulting in performance overhead from locks and reduced stability — not an ideal direction.
+
+## How `__thread` Makes ST Thread-Safe
+
+ST's multi-threading model is simple: **one pthread, one ST scheduler**. It uses GCC's `__thread` so scheduler state is thread-local, not shared global state.
+
+This approach came from [toffaletti's fork](https://github.com/toffaletti/state-threads) and was later adopted by ossrs/state-threads ([state-threads#19](https://github.com/ossrs/state-threads/issues/19)).
+
+In practice, key runtime state is thread-local: current thread, VP/scheduler state, event backend data, free stack list, and key/destructor tables. `st_init()` initializes each thread's runtime (including calling `_st_io_init()` directly). In current code, the netfd freelist is also thread-local (`static __thread _st_netfd_t *_st_netfd_freelist`), so no mutex is needed there.
+
+**Design takeaway:** ST scales by isolation, not heavy locking. Each pthread runs an independent coroutine runtime with its own run queue, timers, and event loop.
+
+**Why SRS still moved away:** This works well inside ST, but SRS still faced hard cross-thread coordination and load-balancing problems at the application level. The project chose Proxy + Origin + Edge cluster architecture for multi-CPU scaling instead.
+
+## Porting ST to New Platforms
+
+Porting ST to a new OS/CPU is simpler than it sounds. The core task is implementing two assembly functions: `_st_md_cxt_save` (save registers) and `_st_md_cxt_restore` (restore registers) — the custom replacements for `setjmp`/`longjmp`.
+
+**Current platform support (from [state-threads#22](https://github.com/ossrs/state-threads/issues/22)):**
+
+- **Linux + i386** — Stable. 32-bit x86 systems.
+- **Linux + x86-64** — Stable. CentOS, Ubuntu server, etc.
+- **Linux + ARM (v7)** — Stable. Raspberry Pi and ARM devices. ([state-threads#1](https://github.com/ossrs/state-threads/issues/1))
+- **Linux + AArch64 (ARMv8)** — Stable. ARM servers. ([state-threads#9](https://github.com/ossrs/state-threads/issues/9))
+- **Linux + MIPS** — Dev. OpenWRT devices. ([state-threads#21](https://github.com/ossrs/state-threads/issues/21))
+- **Linux + MIPS64** — Dev. Loongson 3A4000/3B3000. ([state-threads#21](https://github.com/ossrs/state-threads/issues/21))
+- **Linux + LoongArch64** — Dev. Loongson 3A5000/3B5000, new ISA replacing MIPS. ([state-threads#24](https://github.com/ossrs/state-threads/issues/24))
+- **Linux + RISC-V** — Dev. StarFive boards. ([state-threads#28](https://github.com/ossrs/state-threads/pull/28))
+- **macOS + x86-64** — Stable. Intel Macs. ([state-threads#11](https://github.com/ossrs/state-threads/issues/11))
+- **macOS + AArch64 (M1/M2)** — Dev. Apple Silicon. ([state-threads#30](https://github.com/ossrs/state-threads/issues/30))
+- **Windows + x86-64 (Cygwin64)** — Dev. 64-bit only, no 32-bit Windows. ([state-threads#20](https://github.com/ossrs/state-threads/issues/20))
+
+"Stable" means production-tested in SRS deployments. "Dev" means implemented and working but less field-tested.
+
+**Why custom assembly instead of libc's setjmp/longjmp?**
+Early ST used glibc's `setjmp`, then modified the `jmp_buf` to swap the stack pointer to a heap-allocated coroutine stack. This required knowing glibc's internal `jmp_buf` layout. But newer glibc versions started **encrypting (pointer-mangling)** the saved registers inside `jmp_buf`, making it impossible to modify the SP from user code. The fix: implement save/restore entirely in assembly with ST's own `jmp_buf` layout. This is actually more portable — CPU register ABIs are stable and well-documented, while glibc internals are not. **All platforms now use custom assembly exclusively** — the libc setjmp path has been completely removed (attempting to use it is a compile error). Every OS/CPU goes through `_st_md_cxt_save`/`_st_md_cxt_restore` in the `.S` files.
+
+**Assembly files are organized primarily by OS rather than by CPU (Linux is further split into an x86 file and a non-x86 file):**
+- `md_linux.S` — Linux x86 platforms: i386, amd64/x86_64
+- `md_linux2.S` — Linux non-x86 platforms: aarch64, arm, riscv, mips64, mips, loongarch64
+- `md_darwin.S` — macOS/Darwin (different calling conventions and object format)
+- `md_cygwin64.S` — Windows via Cygwin64
+
+Within each file, CPU-specific sections are selected by `#ifdef` macros (`__x86_64__`, `__aarch64__`, `__mips__`, `__loongarch64`, `__riscv`, etc.).
+
+> Note: All `.S` files check `MD_ST_NO_ASM` — historically this allowed disabling assembly and falling back to libc's `setjmp`/`longjmp`. Since the libc setjmp path has been removed (all platforms now require assembly), this macro no longer works — defining it will cause linker errors. It remains in the code as a leftover.
+
+**What registers to save?**
+Only the **callee-saved registers** matter — these are the registers a function must preserve across calls. The actual registers saved by ST's assembly (from the `.S` files):
+
+- **i386 (Linux):** ebx, esi, edi, ebp, esp, pc
+- **x86-64 (Linux/Darwin/Cygwin64):** rbx, rbp, r12-r15, rsp, pc
+- **ARM v7 (Linux):** v1-v6, sl, fp, sp, lr (i.e., r4-r9, r10, r11, r13, r14); optionally VFP d8-d15 and iWMMXt wr10-wr15
+- **AArch64 (Linux/Darwin):** x19-x28 (callee-saved), x29 (frame pointer), x30/lr (link register), sp, plus floating-point d8-d15
+- **MIPS/MIPS64 (Linux):** sp, gp, fp/s8, s0-s7, ra
+- **LoongArch64 (Linux):** sp (r3), ra (r1), fp (r22), s0-s8 (r23-r31)
+- **RISC-V (Linux):** sp, ra, fp/s0, s1-s11
+
+**The jmpbuf problem (historical, now resolved):**
+Different platforms define `jmp_buf` differently — field names (`__jmpbuf` vs `__jb`), field sizes, and layouts all varied. This was a problem when ST relied on the platform's `jmp_buf`. [state-threads#29](https://github.com/ossrs/state-threads/pull/29) resolved this by having ST define and use its own `_st_jmp_buf_t` structure (`long[22]` — sized for the largest platform, AArch64) instead of relying on platform-specific layouts. All platforms now use this unified structure, eliminating the cross-platform jmpbuf compatibility issue entirely.
+
+The macro `MD_GET_SP(_t)` in `md.h` defines how to read/write the stack pointer in ST's own jmpbuf for each platform. This is critical for `MD_INIT_CONTEXT` — when creating a coroutine, the SP in the saved context must be updated to point at the heap-allocated stack, since the coroutine can't use the creator's stack.
+
+**Porting toolkit (`tools/` directory):**
+Six utilities help with any new port:
+
+- **`porting.c`** — Prints detected OS/CPU macros, pointer sizes, and calling convention info. Run this first to understand your platform.
+- **`helloworld.c`** — Minimal ST validation: `st_init()` + loop with `st_sleep()`. If this prints, context switching works.
+- **`verify.c`** — Full API test: thread creation, mutex, cond variable, usleep, thread join. Validates the complete ST threading model.
+- **`jmpbuf.c`** — Shows the platform's `jmp_buf` struct definition via preprocessor expansion, useful for understanding field layout differences.
+- **`pcs.c`** — Analyzes the Procedure Call Standard (which registers are caller vs callee-saved).
+- **`stack.c`** — Inspects stack behavior on the platform.
+
+**Porting steps (using MIPS/OpenWRT as the reference example from [state-threads#21](https://github.com/ossrs/state-threads/issues/21)):**
+
+1. **Detect CPU macro:** `g++ -dM -E - </dev/null | grep -i aarch64` to find the `#define` your compiler provides (here `aarch64` is just an example — replace it with your target CPU name, e.g. `mips`, `riscv`, `loongarch`)
+2. **Understand the platform:** Run `tools/porting.c` to see detected OS/CPU macros and pointer sizes. Compile `tools/pcs.c` and use GDB's `si` (step instruction) to step through function call assembly — identify which registers are callee-saved (these are the ones you must save/restore in ST). Also refer to vendor docs (ARM/MIPS/RISC-V reference manuals) for the full callee-saved register list. Optionally run `tools/jmpbuf.c` to see how the platform's libc setjmp saves registers — this is a useful cross-reference for confirming the callee-saved register list, even though ST uses its own jmpbuf and doesn't depend on libc's layout
+3. **Add empty stubs:** In the appropriate `.S` file, add `_st_md_cxt_save` and `_st_md_cxt_restore` under a new `#elif defined(__your_cpu__)` — empty functions that just return. Build `verify.c` and `helloworld.c` to confirm compilation and linking succeed, even though they won't run correctly yet
+4. **Implement the assembly:** Fill in the actual save/restore instructions for each callee-saved register to/from the jmpbuf — e.g., `sw`/`lw` (MIPS32), `sd`/`ld` (MIPS64, RISC-V), `stp`/`ldp` (AArch64), `stmia`/`ldmia` (ARM v7 — block load/store with register lists). `_st_md_cxt_save` returns 0; `_st_md_cxt_restore` sets return value to 1 and jumps to the saved return address
+5. **Define MD_GET_SP:** In `md.h`, add the macro for your platform so `MD_INIT_CONTEXT` can replace the SP with the coroutine's heap-allocated stack address
+6. **Test with helloworld:** If it prints messages with `st_sleep` pauses, context switching works
+7. **Test with verify:** Run `verify.c` for full API test — thread creation, mutex, cond variable, usleep, thread join. Also use it early (after adding empty stubs) to verify compilation and linking before implementing the assembly
+
+**Platform-specific build commands:**
+- Linux: `make linux-debug` (auto-detects CPU)
+- macOS: `make darwin-debug`
+- Windows: `make cygwin64-debug`
+- Force CPU: `make linux-debug EXTRA_CFLAGS="-D__aarch64__"` (if auto-detection fails)
+
+## Future Direction: Refactor ST Internals from C to C++
+
+The current ST codebase is written in C with heavy use of macros and manual struct patterns (embedded linked lists, struct casting for "inheritance", macro-based queue operations). This code is difficult to read and understand — both for humans and AI. The macro layer obscures the actual logic, and C's manual patterns for data structures are not straightforward.
+
+**The plan:** Refactor ST's internal implementation from C to C++, while keeping the external C API unchanged (`st_read`, `st_write`, `st_accept`, etc. remain `extern "C"`). This is an internal rewrite only — no API changes for consumers.
+
+**Why C++:**
+- Replace opaque macros with readable C++ constructs (classes, templates, inline functions)
+- Replace manual linked list macros and struct casting with proper C++ data structures and type safety
+- RAII for resource management (stack allocation/deallocation, fd lifecycle)
+- The code becomes much clearer and more maintainable — critical for AI-managed maintenance
+
+**Why this matters for the AI strategy:**
+- AI can reason about C++ code far more easily than macro-heavy C
+- This directly enables the vision of AI maintaining ST long-term
+- Better code quality → fewer bugs → more confidence in AI-generated changes
+
+**Approach:** Incremental — start with the worst offenders (likely the macros in `common.h` and queue management in `sched.c`), convert piece by piece, verify with tests at each step.
+
+## Can AI Replace Rust for ST Maintenance?
+
+Rust (specifically Tokio) is conceptually similar to ST — both are polling-based async with cooperative scheduling. Rust offers advantages: no assembly needed, built-in multi-thread support, cross-platform without manual porting, better tooling. The "Hidden Flaws of SRS" blog explored Rust as a potential future direction.
+
+However, the real question is not about language features but about **ecosystem and maintenance capability**. If AI proves capable of maintaining ST's assembly code — understanding CPU register conventions, porting to new architectures, debugging platform-specific issues like Windows/SEH — then the ST maintenance burden disappears and there's no compelling reason to switch languages. The C++ ecosystem for the media industry (FFmpeg, libsrt, libwebrtc, and other open-source media streaming projects) matters more than language features.
+
+Rust is a fallback path if AI cannot handle the low-level ST maintenance. It's not an inevitable direction. The deciding factor is AI capability, not language preference.
+
+## Backtrace Support for Coroutines
+
+ST supports `backtrace()` and `backtrace_symbols()` for dumping stack traces from within coroutines ([state-threads#34](https://github.com/ossrs/state-threads/issues/34)). Since each coroutine has its own stack, standard backtrace works naturally — you get a full call chain like `bar → foo → start → _st_thread_main → st_thread_create`.
+
+**Usage:** Build and run the example in `tools/backtrace/`. Works on both Linux and Darwin.
+
+**Key details:**
+- On Linux, compile with `-rdynamic` to get function names in `backtrace_symbols()` output; without it you get raw offsets like `(+0x204b)`
+- On Darwin, uses `__builtin_return_address` to walk the stack
+- The return address points to the **instruction after the call** (the return site), so `addr2line` shows the next source line, not the call line itself — this is normal
+- Use `addr2line -C -p -s -f -a -e <binary> <address>` to convert offsets to source file:line
+- Use `nm <binary> | grep <func>` to find function base addresses, then compute offsets
+- `objdump -d <binary>` can verify the relationship between addresses and instructions
+
+This complements ST's GDB helper scripts (`nn_coroutines`, `show_coroutines`) as another debugging tool for coroutine-based code.
+
+## Timeout Semantics
+
+ST timeouts have a subtle but important behavior: the timeout parameter is relative to `last_clock` (the timestamp of the last scheduler cycle), not the moment the function is called.
+
+When you call `st_read(fd, buf, n, timeout)`, internally ST computes the deadline as `due = last_clock + timeout` (in `_st_add_sleep_q`). The `last_clock` value is updated in `_st_vp_check_clock()`, which only runs during scheduler cycles — in the idle thread loop after `dispatch()` returns, and in `st_thread_yield()`. Between those points, `last_clock` is frozen.
+
+**The cancelling effect:** Both the deadline and the `epoll_wait` timeout are computed from the same stale `last_clock`: `due = last_clock + timeout`, `min_timeout = due - last_clock = timeout`. The staleness cancels out — `epoll_wait` always receives the full `timeout` value. On a busy server, frequent I/O keeps `last_clock` fresh and deadlines fire on time. On a quiet server, the actual wait approximates the full `timeout` regardless of staleness. For example:
+
+- `last_clock` was set 8ms ago, you call `st_read()` with a 10ms timeout → `due = last_clock + 10ms` (only 2ms from now), but dispatch computes `min_timeout = due - last_clock = 10ms` → `epoll_wait` blocks for the full 10ms
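+- If `last_clock` is refreshed before dispatch runs (for example, on a busy server where another coroutine calls `st_thread_yield()`, which runs `_st_vp_check_clock()`), dispatch instead computes `min_timeout = due - last_clock = 2ms` and the deadline fires at `due`, about 2ms after the call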
+
+ST timeouts are suitable for coarse-grained purposes — detecting broken connections, idle peers, or stuck operations. Realistic timeouts should be on the order of seconds (e.g., 5s, 30s), where `last_clock` staleness is negligible. They are not designed for precise sub-millisecond timing.
+
+**Related test cases (`st_utest_learn_kb.cpp`):**
+- `TEST(LearnKB, BasicNetfdReadTimeout)`
+- `TEST(LearnKB, CondTimedwaitTimeout)`
+
+## `st_init()` — How the Coroutine World is Built
+
+`st_init()` is the entry point that bootstraps the entire coroutine runtime. It creates the scheduler data structures, the event system, the idle thread, and wraps the calling OS thread as the first coroutine. Here's what happens step by step:
+
+**Step 1: Set event system and initialize I/O.** SRS calls `st_set_eventsys(ST_EVENTSYS_ALT)` before `st_init()` to select the platform-optimal backend — epoll on Linux, kqueue on macOS. Inside `st_init()`, `st_set_eventsys(ST_EVENTSYS_DEFAULT)` is called but returns `EBUSY` (since the event system is already set) and is harmlessly ignored — hence the code comment "We can ignore return value here". Then `_st_io_init()` runs for one-time I/O setup (ignores SIGPIPE, sets fd limits).
+
+**Step 2: Initialize all scheduler queues and create the event system.** First initializes the thread-local free stack list (`_st_free_stacks`), then zeroes the VP struct (`memset(&_st_this_vp, 0, ...)`), then initializes three empty linked lists:
+- `run_q` — coroutines ready to run
+- `io_q` — coroutines blocked on socket I/O
+- `zombie_q` — dead coroutines awaiting cleanup
+
+Then calls `(*_st_eventsys->init)()` to create the actual epoll/kqueue file descriptor. Also captures `pagesize` (for stack guard pages) and `last_clock` (current timestamp for timeout calculations).
+
+**Step 3: Create the Idle Thread.** This is the heart of ST. The idle thread is created via `st_thread_create(_st_idle_thread_start)`, then marked with `_ST_FL_IDLE_THREAD`; `_st_active_count` is decremented (the idle thread doesn't count as an "active" coroutine), and the idle thread is removed from the run queue (it's managed specially by the scheduler).
+
+The idle thread's loop is the core scheduler cycle:
+1. `(*_st_eventsys->dispatch)()` — calls `epoll_wait` (epoll backend) or `kevent` (kqueue backend), blocking until I/O is ready or the earliest timeout fires. This is the **only place the process truly blocks**.
+2. `_st_vp_check_clock()` — updates `last_clock`, then walks the sleep heap and moves timed-out coroutines to the run queue.
+3. `_st_switch_context(me)` — yields CPU to a ready coroutine from the run queue.
+4. When that coroutine eventually switches back (hits I/O or yields), the idle thread loops again.
+5. When `_st_active_count` drops to 0 (no more coroutines), the idle thread calls `exit(0)`.
+
+**Step 4: Create the Primordial Thread.** The current OS thread (the one calling `st_init()`) is wrapped into an `_st_thread_t` struct via `calloc`. It gets no new stack — it reuses the existing process stack. Its state is set to `_ST_ST_RUNNING`, flagged as `_ST_FL_PRIMORDIAL`, and assigned to `_st_this_thread`. This becomes the first running coroutine and `_st_active_count` is incremented to 1.
+
+**After `st_init()` returns**, the coroutine world is ready: event system initialized, idle thread created and waiting, primordial thread running. Control returns to `main()`, which is now executing as the primordial coroutine. From here, calling `st_thread_create()` spawns new coroutines, which are queued on `run_q` but do not run yet; the scheduler workflow activates the first time the running coroutine blocks or yields (I/O, sleep, join, or `st_thread_yield()`).
+
+**Related test cases (`st_utest_learn_kb.cpp`):**
+- `TEST(LearnKB, EventSysSelectedAndLockedAfterInit)`
+- `TEST(LearnKB, CoroutineRunsOnSeparateStack)`
+- `TEST(LearnKB, StartRoutineNotExecutedInline)`
+- `TEST(LearnKB, JoinDrivesFirstRunWhenNoManualYield)`
+
+## `st_thread_create()` — How a Coroutine is Born
+
+`st_thread_create(start, arg, joinable, stk_size)` allocates a new coroutine, sets up its execution context, and places it on the run queue — all without actually running it yet.
+
+**Step 1: Allocate the stack.** The requested size (default `ST_DEFAULT_STACK_SIZE` = 128KB) is rounded up to page alignment, then `_st_stack_new()` either reuses a stack from the free list or allocates fresh memory (via `mmap` or `malloc`). In DEBUG builds (without `MD_NO_PROTECT`), guard pages are set at both ends of the stack via `mprotect(..., PROT_NONE)` to catch overflow via SIGSEGV; in release builds there are no guard pages.
+
+**Step 2: Carve thread metadata from the top of the stack.** The thread control block (`_st_thread_t`) and per-thread data array (`ptds[ST_KEYS_MAX]`) are placed at the top of the stack, growing downward. Then the stack pointer is 64-byte aligned. The layout from high to low address:
+
+- `ptds[ST_KEYS_MAX]` — per-thread data slots (like thread-local storage)
+- `_st_thread_t` — the thread control block
+- 64-byte alignment padding
+- 128-byte pad (`_ST_STACK_PAD_SIZE`) reserved below the aligned SP
+- `stack->sp` — where the coroutine's actual execution stack begins (grows downward)
+- Guard page at the bottom (DEBUG builds only)
+
+Both `thread` and `ptds` are zeroed with `memset` after being carved from the stack. This is efficient: one allocation provides both the stack and the control block — no separate `malloc` for the thread struct.
+
+**Step 3: Set up the initial context (the core trick).** This is the most subtle part:
+
+```c
+/* Note that we must directly call rather than call any functions. */
+if (_st_md_cxt_save(thread->context)) {
+    _st_thread_main();
+}
+MD_GET_SP(thread) = (long)(stack->sp);
+```
+
+The code comment is a correctness constraint: `_st_md_cxt_save` must be called directly at this site, not wrapped in a helper function. It captures the current PC (return address) — when `_st_md_cxt_restore` later restores this context, execution resumes right here at the `if` check. If `_st_md_cxt_save` were called inside a helper function, the saved PC would point into that helper's frame, and the restored execution would return into a function frame that doesn't exist on the new coroutine's stack — crash.
+
+- `_st_md_cxt_save()` saves the **creator's** current CPU registers into `thread->context` and returns **0** (like `setjmp`)
+- Since it returns 0, the `if` body is **skipped** — `_st_thread_main()` is NOT called now
+- `MD_GET_SP()` then **overwrites the saved stack pointer** in the context to point at the new coroutine's heap-allocated stack
+
+Later, when the scheduler switches to this coroutine via `_st_md_cxt_restore(thread->context, 1)`, it restores these saved registers — but with the **modified SP** pointing at the new stack. Because the restore passes 1 as the return value, the `_st_md_cxt_save` call site now appears to return **1** (non-zero), so the `if` is entered and `_st_thread_main()` executes — now running on the coroutine's own stack. `_st_thread_main()` simply calls `thread->start(thread->arg)` (the user's function), and when it returns, calls `st_thread_exit()`.
+
+This save-then-patch-SP trick is how ST creates a coroutine without running it: capture a register snapshot, swap the stack pointer to the new stack, defer execution until scheduled.
+
+**Step 4: Set up joinability.** If `joinable` is true, a condition variable (`thread->term`) is allocated so another coroutine can call `st_thread_join()` and block until this coroutine finishes.
+
+**Step 5: Make it runnable.** The thread's state is set to `_ST_ST_RUNNABLE`, `_st_active_count` is incremented, and the thread is inserted into `run_q`. The coroutine won't actually execute until the current coroutine yields (hits I/O, sleeps, or calls `st_thread_yield()`), at which point the scheduler picks it off the run queue.
+
+**Valgrind integration:** If `MD_VALGRIND` is enabled and the thread is not the primordial thread, `VALGRIND_STACK_REGISTER()` is called to register the custom stack region with Valgrind, preventing false positives from stack pointer switching.
+
+**Related test cases (`st_utest_learn_kb.cpp`):**
+- `TEST(LearnKB, CoroutineRunsOnSeparateStack)`
+- `TEST(LearnKB, StartRoutineNotExecutedInline)`
+- `TEST(LearnKB, JoinDrivesFirstRunWhenNoManualYield)`
+- `TEST(LearnKB, LocalStatePreservedAcrossYield)`
+- `TEST(LearnKB, ReturnValueThroughJoin)`
+
+## Epoll-Driven I/O Workflow — How Coroutines Sleep and Wake
+
+This section traces exactly what happens when a coroutine does I/O — from `st_read()` through epoll and back. This is the core mechanism that makes ST work.
+
+**The I/O functions all follow the same pattern** (`st_read`, `st_write`, `st_accept`, `st_connect`, `st_recvfrom`, `st_sendto`, `st_recvmsg`, `st_sendmsg`):
+
+1. Try the raw syscall (`read`, `write`, `accept`, etc.) immediately
+2. If it succeeds → return the result. Zero overhead, no coroutine machinery involved
+3. If `EINTR` → retry the syscall
+4. If `EAGAIN`/`EWOULDBLOCK` → the socket isn't ready, enter the coroutine wait path via `st_netfd_poll()`
+5. If any other error → return error
+
+This "try first" design means ST adds zero overhead when data is already available — it's just a normal syscall.
+
+**`st_netfd_poll()` → `st_poll()`:** The poll wrapper converts a single fd into a `struct pollfd` and calls `st_poll()`, which is where the coroutine actually suspends.
+
+**`st_poll()` — The Coroutine Suspension Point (sched.c):**
+
+1. **Register with epoll:** Calls `_st_eventsys->pollset_add(pds, npds)`, which increments per-fd reference counts (`_ST_EPOLL_READ_CNT`, `_ST_EPOLL_WRITE_CNT`, `_ST_EPOLL_EXCEP_CNT` — one per event direction), then computes the new event mask from those counts and calls `epoll_ctl` — `EPOLL_CTL_ADD` if the fd had no prior watchers, or `EPOLL_CTL_MOD` if it already did. Multiple coroutines can watch the same fd — reference counts track this.
+
+2. **Create a poll queue entry on the stack:** A `_st_pollq_t` struct links the pollfd array to the waiting coroutine (`pq.thread = me`) and sets `pq.on_ioq = 1`.
+
+3. **Insert into `io_q`:** The poll queue entry is linked into the current scheduler's (thread-local) I/O wait list. Later, `_st_epoll_dispatch()` walks this list to find which coroutines to wake.
+
+4. **Add to sleep heap (if timeout specified):** `_st_add_sleep_q(me, timeout)` sets `me->due = last_clock + timeout` and inserts into the binary heap. This ensures the coroutine wakes even if I/O never arrives.
+
+5. **Set state to `_ST_ST_IO_WAIT`** and call `_st_switch_context(me)` — the coroutine suspends. Registers are saved, and the scheduler picks the next runnable coroutine (or the idle thread if nothing else is ready).
+
+**`_st_vp_schedule()` — The Scheduler (sched.c):**
+
+When `_st_switch_context()` is called, `_st_vp_schedule()` decides what runs next:
+- If `run_q` is non-empty → pull the first thread off the head, switch to it
+- If `run_q` is empty → switch to the idle thread
+
+The idle thread is where `epoll_wait` lives — it's the "nobody else has anything to do, so let's wait for I/O" fallback.
+
+**The Idle Thread Loop (sched.c):**
+
+The idle thread runs a tight loop until no active coroutines remain:
+1. `_st_eventsys->dispatch()` → calls `_st_epoll_dispatch()`
+2. `_st_vp_check_clock()` → process expired timeouts
+3. Set self to `RUNNABLE`, call `_st_switch_context()` → yield to a ready coroutine
+4. When that coroutine eventually suspends, we return here and loop
+
+**`_st_epoll_dispatch()` — The Heart of I/O Multiplexing (event.c):**
+
+This function does three things: wait for I/O, wake coroutines, and clean up epoll state.
+
+*Phase 1 — Calculate epoll timeout from sleep heap:*
+- If `sleep_q` is NULL (no sleeping coroutines) → `timeout = -1` (block forever)
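+- Otherwise → `min_timeout = sleep_q->due - last_clock` (time until the earliest deadline, in microseconds), and `timeout` is `min_timeout` converted to the milliseconds that `epoll_wait` expects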
+- Special case: if timeout computes to 0ms but `min_timeout > 0` (sub-millisecond), round up to 1ms to avoid a spin loop (epoll_wait only has millisecond granularity)
+
+*Phase 2 — `epoll_wait()`:*
+- `nfd = epoll_wait(epfd, evtlist, evtlist_size, timeout)` — **this is the only true blocking point in the entire process**. The OS suspends the process until at least one fd is ready or the timeout expires.
+
+*Phase 3 — Mark fired fds:*
+- For each event returned by epoll, store the fired events in `_ST_EPOLL_REVENTS(osfd)`. If `EPOLLERR` or `EPOLLHUP` is set, also OR in the fd's currently-registered event bits (`_ST_EPOLL_EVENTS(osfd)` — whichever of `EPOLLIN`/`EPOLLOUT`/`EPOLLPRI` have non-zero ref counts) so waiting coroutines see the error.
+
+*Phase 4 — Walk `io_q`, wake matching coroutines:*
+- For each `_st_pollq_t` entry on `io_q`, check each fd in its pollfd array against `_ST_EPOLL_REVENTS`. If any fd has matching events, set `pds->revents` and mark `notify = 1`.
+- If notify: remove the poll queue entry from `io_q` (`pq->on_ioq = 0`), call `_st_epoll_pollset_del()` which decrements reference counts for all fds in the pollfd array and calls `EPOLL_CTL_MOD`/`EPOLL_CTL_DEL` only for fds that did NOT fire (fds with `_ST_EPOLL_REVENTS != 0` are skipped — they are cleaned up later in Phase 5). Then remove the thread from `sleep_q` if present, set `thread->state = _ST_ST_RUNNABLE`, and insert into `run_q`.
+
+*Phase 5 — Clean up fired fds in epoll:*
+- For each event in the epoll result list, clear `_ST_EPOLL_REVENTS`, then either `EPOLL_CTL_MOD` (if other coroutines still watch this fd) or `EPOLL_CTL_DEL` (if no more watchers). This keeps epoll's internal state in sync with ST's reference counts.
+
+**`_st_vp_check_clock()` — Timeout Processing (sched.c):**
+
+After dispatch returns, the idle thread checks for expired timeouts:
+1. Read the current time (`now = st_utime()`) and update `last_clock = now`
+2. Walk the sleep heap root: while `sleep_q->due <= now`, remove the thread from the heap
+3. If the thread was in `_ST_ST_COND_WAIT` state, set the `_ST_FL_TIMEDOUT` flag
+4. Set `thread->state = _ST_ST_RUNNABLE` and insert at the **head** of `run_q` (using `st_clist_insert_after`)
+
+**Priority detail:** Timed-out coroutines go to the head of `run_q`; I/O-ready coroutines go to the tail. Timeouts get priority because they represent deadlines.
+
+**Resumption — Back in `st_poll()`:**
+
+When the scheduler switches back to our coroutine, execution resumes right after the `_st_switch_context()` call in `st_poll()`:
+- If `pq.on_ioq == 0` → dispatch already removed us from `io_q`, I/O is ready. Count fds with non-zero `revents` and return the count.
+- If `pq.on_ioq == 1` → we timed out (woken by `_st_vp_check_clock`, not by dispatch). Remove ourselves from `io_q`, call `pollset_del` to clean up epoll registration, return 0.
+
+Back in `st_read()`, if `st_netfd_poll()` succeeded, the loop retries `read()` — which now succeeds because data is available. The data is returned to the application.
+
+**The `on_ioq` flag is the key mechanism** that distinguishes timeout wakeup from I/O wakeup. The dispatch function clears it (`pq->on_ioq = 0`) when I/O fires; `_st_vp_check_clock` does NOT touch it. So when `st_poll()` resumes, it checks this flag to know why it woke up.
+
+**Reference counting for shared fds:** Multiple coroutines can wait on the same fd (e.g., multiple readers on a UDP socket). `_ST_EPOLL_READ_CNT(fd)`, `_ST_EPOLL_WRITE_CNT(fd)`, and `_ST_EPOLL_EXCEP_CNT(fd)` track how many coroutines watch each direction. `epoll_ctl` is only called when the computed event mask (`_ST_EPOLL_EVENTS(fd)`) changes — i.e., when counts transition between 0 and non-zero. When a coroutine's I/O completes, `_st_epoll_pollset_del` decrements the counts — if all reach 0, the fd is removed from epoll; otherwise it's modified to reflect remaining watchers.
+
+**Related test cases (`st_utest_learn_kb.cpp`):**
+- `TEST(LearnKB, BasicNetfdWriteThenRead)`
+- `TEST(LearnKB, BasicNetfdReadTimeout)`
+
+## `st_usleep()` — Pure Timer-Based Coroutine Sleep
+
+`st_usleep(usecs)` suspends the current coroutine for a specified duration. Unlike I/O functions (`st_read`, `st_write`), it involves **no I/O at all** — the coroutine is placed only in the sleep heap and woken purely by timeout expiration.
+
+**The function (sync.c):**
+
+1. **Check interrupt flag.** If `_ST_FL_INTERRUPT` is set on the current thread, clear it, set `errno` to `EINTR`, and return -1 immediately — the coroutine was interrupted by `st_thread_interrupt()` before it even started sleeping.
+
+2. **Set state and enter sleep heap.** If a finite timeout is given: set `me->state = _ST_ST_SLEEPING`, then `_st_add_sleep_q(me, usecs)` which computes `me->due = last_clock + usecs`, sets the `_ST_FL_ON_SLEEPQ` flag, assigns a heap index, and inserts into the binary heap (O(log N)). If `ST_UTIME_NO_TIMEOUT` is passed (via `st_sleep(-1)`), the state is set to `_ST_ST_SUSPENDED` instead — no sleep queue entry, the coroutine hangs indefinitely until explicitly interrupted.
+
+3. **Suspend.** `_st_switch_context(me)` saves the coroutine's CPU registers via `_st_md_cxt_save(me->context)` (returns 0 on save), then calls `_st_vp_schedule(me)` which picks the next runnable coroutine from `run_q` — or the idle thread if nothing else is ready — and switches to it via `_st_restore_context`.
+
+4. **The coroutine is now frozen.** Its registers are saved in `me->context`, and it sits in the sleep heap with a computed deadline. It is **not** in `io_q` and has no epoll registration — this is the key difference from I/O wait.
+
+5. **The idle thread wakes it.** The idle thread's `epoll_wait` uses the sleep heap root's deadline as its timeout. When `epoll_wait` returns (either from I/O on other fds or timeout expiration), `_st_vp_check_clock()` runs: it reads `now = st_utime()`, updates `last_clock`, then walks the sleep heap — any thread with `due <= now` is removed from the heap, set to `_ST_ST_RUNNABLE`, and inserted at the **head** of `run_q` (timed-out coroutines get priority over I/O-ready ones).
+
+6. **Resume.** When the scheduler switches back, `_st_md_cxt_restore(me->context, 1)` restores the saved registers and never returns to its own caller; instead, the earlier `_st_md_cxt_save(me->context)` call appears to return a second time, now with 1 (non-zero). The `if` in `_st_switch_context` is therefore skipped and execution continues after the `_st_switch_context()` call in `st_usleep()`. A final interrupt check is performed, then `return 0` — sleep complete.
+
+**Comparison with I/O wait path:**
+
+- `st_usleep` uses **only the sleep heap** — no `io_q`, no `epoll_ctl`, no epoll registration. The coroutine is woken exclusively by `_st_vp_check_clock()`.
+- `st_read`/`st_write` (EAGAIN path) uses **both `io_q` and the sleep heap** — epoll watches the fd, and the sleep heap provides timeout fallback. The coroutine can be woken by either `_st_epoll_dispatch()` (I/O ready) or `_st_vp_check_clock()` (timeout), distinguished by the `on_ioq` flag.
+- In both cases, `epoll_wait`'s timeout is derived from the sleep heap root, so pure-sleep coroutines still influence when `epoll_wait` returns.
+
+**Timeout precision note:** The deadline is `last_clock + usecs`, not `now + usecs`. If CPU work happened since the last scheduler cycle (the last time `_st_vp_check_clock` updated `last_clock`), part of the sleep duration is already "consumed." For typical sleep durations (seconds), this staleness is negligible. This is why ST timeouts are designed for coarse-grained use — detecting broken connections or idle peers, not sub-millisecond timing.
+
+**Related test cases (`st_utest_learn_kb.cpp`):**
+- `TEST(LearnKB, ThreadInterruptWakeupFromUsleep)`
+- `TEST(LearnKB, LocalStatePreservedAcrossYield)`
+- `TEST(LearnKB, StartRoutineNotExecutedInline)`
+
+## `st_mutex` — Cooperative Mutex Workflow
+
+ST's mutex is simple because cooperative scheduling eliminates the need for atomic operations, spinlocks, or memory barriers — just pointer manipulation and coroutine switching.
+
+**The struct (common.h):**
+- `_st_thread_t *owner` — current mutex owner, NULL means unlocked
+- `_st_clist_t wait_q` — linked list of coroutines waiting to acquire the mutex
+
+**`st_mutex_lock()` (sync.c):**
+
+1. **Check interrupt flag.** If `_ST_FL_INTERRUPT` is set on the current thread, clear it and return `EINTR` immediately — the coroutine was interrupted before it even attempted to acquire the lock. (Same pattern as `st_usleep` and `st_cond_timedwait`.)
+
+2. **Uncontended (owner == NULL).** Set `lock->owner = me`, return 0. Instant — just a pointer comparison and assignment, the cheapest possible lock. Safe because cooperative scheduling guarantees no other coroutine runs between the check and the assignment.
+
+3. **Same owner (owner == me).** Return `EDEADLK`. Deadlock detection — if you try to lock a mutex you already own, ST catches it immediately instead of hanging forever.
+
+4. **Contended (owner == someone else).** The coroutine must wait:
+   - Set `me->state = _ST_ST_LOCK_WAIT`
+   - Insert `me` into `lock->wait_q` (FIFO — appended at the tail via `st_clist_insert_before`)
+   - Call `_st_switch_context(me)` — save registers, yield to scheduler
+   - The coroutine is now frozen: sitting on `lock->wait_q`, **not** on `sleep_q` (no timeout), **not** on `io_q` (no I/O). It can only be woken by `st_mutex_unlock()` or `st_thread_interrupt()`
+   - When resumed: remove self from `wait_q`, check if interrupted (return `EINTR` if interrupted and not the owner), otherwise return 0 — the coroutine now owns the mutex
+
+**`st_mutex_unlock()` (sync.c):**
+
+First checks that the caller actually owns the mutex (returns `EPERM` if not). Then walks `lock->wait_q` looking for a thread in `_ST_ST_LOCK_WAIT` state:
+
+- **Waiter found:** Direct ownership transfer — sets `lock->owner = waiter` immediately, sets `waiter->state = _ST_ST_RUNNABLE`, inserts waiter into `run_q` (at tail, normal priority). The unlocker does **not** yield — it keeps running. The waiter resumes when the current coroutine eventually hits I/O or yields.
+- **No waiters:** Simply sets `lock->owner = NULL`.
+
+**Key design properties:**
+
+- **No timeout.** Unlike `st_usleep` or `st_read`, `st_mutex_lock` has no timeout parameter. A waiting coroutine is not placed on the sleep heap — it waits indefinitely until unlocked or interrupted. For timed locking semantics, use `st_cond_timedwait` instead.
+- **Direct ownership transfer.** When unlocking with waiters, `lock->owner = waiter` is set before the waiter even resumes. This prevents a third coroutine from grabbing the mutex between the unlock and the waiter's resumption. Safe because cooperative scheduling means no preemption between these operations.
+- **FIFO fairness.** Waiters are added to the tail of `wait_q`; `st_mutex_unlock` walks from the head. First to wait is first to acquire. No starvation.
+- **No spin, no atomic ops, no syscalls.** The uncontended path is a pointer comparison and assignment. The contended path suspends the coroutine entirely — no busy-waiting. All of this works because no other coroutine can run between a check and an assignment in cooperative scheduling.
+
+**Related test cases (`st_utest_learn_kb.cpp`):**
+- `TEST(LearnKB, MutexCooperativeWorkflow)`
+- `TEST(LearnKB, ThreadInterruptWakeupFromMutexWait)`
+
+## `st_cond` — Condition Variable Workflow
+
+ST's condition variable (`st_cond`) follows a similar pattern to `st_mutex` — `wait_q` linked list, `_st_switch_context` to suspend, wake by setting state to `RUNNABLE` — but solves a fundamentally different problem: **waiting for a condition/event to happen**, not exclusive resource ownership.
+
+**The struct (common.h):** Just a `wait_q` linked list — no `owner` field. Nobody "owns" a condition variable.
+
+**Comparison with `st_mutex`:**
+- **Purpose:** `st_mutex` = exclusive ownership of a resource. `st_cond` = wait for something to happen.
+- **Owner:** `st_mutex` has `lock->owner`. `st_cond` has no ownership concept.
+- **Timeout:** `st_mutex_lock` has no timeout — waits forever. `st_cond_timedwait` supports timeout via the sleep heap.
+- **Wake semantics:** `st_mutex_unlock` transfers ownership to ONE waiter. `st_cond_signal` wakes ONE waiter, `st_cond_broadcast` wakes ALL waiters — no ownership transfer.
+- **Wait state:** `st_mutex` uses `_ST_ST_LOCK_WAIT`. `st_cond` uses `_ST_ST_COND_WAIT`.
+
+**`st_cond_timedwait()` (sync.c):**
+
+1. **Check interrupt flag.** Return `EINTR` if `_ST_FL_INTERRUPT` is set.
+2. **Enter wait queue.** Set `me->state = _ST_ST_COND_WAIT`, insert into `cvar->wait_q` (FIFO).
+3. **Optionally enter sleep heap.** If `timeout != ST_UTIME_NO_TIMEOUT`, call `_st_add_sleep_q(me, timeout)`. This is the key difference from `st_mutex` — the coroutine lands in **two places simultaneously**: `cvar->wait_q` AND the sleep heap. It gets woken by whichever fires first.
+4. **Suspend.** `_st_switch_context(me)` saves registers and yields to the scheduler.
+5. **Resume.** Remove self from `wait_q`. Check `_ST_FL_TIMEDOUT` → return `ETIME`. Check `_ST_FL_INTERRUPT` → return `EINTR`. Otherwise return 0 (signaled successfully).
+
+**Dual-wakeup mechanism:** When both `wait_q` and sleep heap are active, the coroutine wakes from whichever fires first:
+- **Signal fires first** → `_st_cond_signal` removes the coroutine from the sleep heap (`_st_del_sleep_q`), sets it RUNNABLE. On resume, no `_ST_FL_TIMEDOUT` flag → returns 0.
+- **Timeout fires first** → `_st_vp_check_clock` finds the coroutine in `_ST_ST_COND_WAIT` state, sets the `_ST_FL_TIMEDOUT` flag, moves it to `run_q` (at head — timeout priority). On resume, flag is set → returns `ETIME`. The coroutine is still on `wait_q` but removes itself upon resumption.
+
+This is the same dual-wakeup pattern as the I/O path (`io_q` + sleep heap), but with `wait_q` instead of `io_q`.
+
+**`_st_cond_signal()` (sync.c):**
+
+Walks `cvar->wait_q` from head, for each thread in `_ST_ST_COND_WAIT` state: if the thread is on the sleep heap, removes it (`_st_del_sleep_q`); sets `thread->state = _ST_ST_RUNNABLE`; inserts into `run_q`. If not broadcast, breaks after the first waiter. If broadcast, continues to wake all waiters.
+
+**`st_cond_wait()` is simply `st_cond_timedwait(cvar, ST_UTIME_NO_TIMEOUT)`** — no sleep heap entry, waits indefinitely until signaled or interrupted.
+
+**Why SRS uses `st_cond` far more than `st_mutex`:** In a cooperative coroutine system, there is no preemption — no other coroutine runs between a check and an assignment, so mutual exclusion is rarely needed. What SRS needs constantly is "wait until something happens" — data arrives on an SRT socket, a stream becomes available, a client connects. `st_cond` is the primary tool for this. The coroutine-native SRT pattern is a perfect example: a coroutine calls `st_cond_wait` when `srt_recvmsg` returns EAGAIN, and a poller coroutine calls `st_cond_signal` when the fd becomes ready.
+
+**Related test cases (`st_utest_learn_kb.cpp`):**
+- `TEST(LearnKB, CondSignalWakeOne)`
+- `TEST(LearnKB, CondBroadcastWakeAll)`
+- `TEST(LearnKB, CondTimedwaitTimeout)`
+- `TEST(LearnKB, ThreadInterruptWakeupFromCondWait)`
+
+## `st_thread_exit()` — How a Coroutine Dies
+
+`st_thread_exit(retval)` is called when a coroutine finishes — either explicitly by the user or implicitly via `_st_thread_main()` after the start function returns. It handles cleanup, joinability, and stack recycling. The coroutine never returns from this function.
+
+**Step 1: Store return value and run destructors.** Sets `thread->retval = retval`, then calls `_st_thread_cleanup(thread)` which iterates all created thread-specific data keys (up to `key_max`, not `ST_KEYS_MAX`). For each key that has both a non-NULL value and a registered destructor function, it calls the destructor and clears the slot. This is ST's equivalent of pthread key destructors.
+
+**Step 2: Decrement active count.** `_st_active_count--` — one fewer active coroutine. When this reaches 0, the idle thread will call `exit(0)` and the process terminates.
+
+**Step 3: Handle joinable threads (the zombie path).** If `thread->term` is non-NULL (thread was created with `joinable = 1`):
+
+1. Set `thread->state = _ST_ST_ZOMBIE` — the thread is dead but its resources are preserved for the joiner to inspect.
+2. Insert into `zombie_q` — this keeps the thread struct alive so `st_thread_join()` can read `retval`.
+3. `st_cond_signal(thread->term)` — wake any coroutine blocked in `st_thread_join()` waiting on this thread's termination condition variable.
+4. `_st_switch_context(thread)` — the zombie suspends. It will only be rescheduled by `st_thread_join()` after the joiner has read the return value.
+5. When rescheduled (by the joiner): destroy the termination condvar (`st_cond_destroy(thread->term)`), set `thread->term = NULL`.
+
+**Step 4: Free the stack.** If the thread is not the primordial thread, `_st_stack_free(thread->stack)` puts the stack on the free list. (Valgrind deregistration happens just before this if `MD_VALGRIND` is enabled.) The primordial thread's stack is the process stack — it's never freed.
+
+**Step 5: Final switch — no return.** `_st_switch_context(thread)` is called one last time. The scheduler picks the next runnable coroutine. Since the exiting thread is not on any queue (it is not in `run_q`; a joinable thread has already been taken off `zombie_q` by its joiner, and a non-joinable thread was never on it), it will never be scheduled again. Its stack has already been put on the free list in step 4 (unless it is the primordial thread), and execution never returns here.
+
+**Why zombies need two context switches:** The first `_st_switch_context` (step 3) suspends the zombie so the joiner can run and read `retval`. The joiner then puts the zombie back on `run_q` (see `st_thread_join` below). The second `_st_switch_context` (step 5) is the final one — after the joiner has extracted what it needs, the zombie resumes briefly to destroy its condvar and free its stack, then switches away forever.
+
+**Non-joinable threads skip the zombie path entirely** — no condvar signal, no zombie queue, no waiting for a joiner. They go straight from cleanup → stack free → final switch.
+
+**Related test cases (`st_utest_learn_kb.cpp`):**
+- `TEST(LearnKB, ThreadExitExplicitRetvalThroughJoin)`
+- `TEST(LearnKB, ThreadExitNonJoinableCannotJoin)`
+- `TEST(LearnKB, ReturnValueThroughJoin)`
+
+## `st_thread_join()` — Waiting for a Coroutine to Finish
+
+`st_thread_join(thread, retvalp)` blocks the calling coroutine until the target thread exits. It's the mechanism for "wait for this coroutine to complete and get its result."
+
+**Precondition checks:**
+
+1. `thread->term == NULL` → thread is not joinable (created with `joinable = 0`). Returns `EINVAL`.
+2. `_st_this_thread == thread` → trying to join yourself. Returns `EDEADLK`.
+3. `term->wait_q` is non-empty → another coroutine is already waiting to join this thread. Returns `EINVAL`. Only one joiner is allowed per joinable thread.
+
+**The wait loop:**
+
+```c
+while (thread->state != _ST_ST_ZOMBIE) {
+    if (st_cond_timedwait(term, ST_UTIME_NO_TIMEOUT) != 0)
+        return -1;
+}
+```
+
+The joiner calls `st_cond_timedwait` on the target's termination condvar with no timeout — it suspends indefinitely. When `st_thread_exit()` signals this condvar, the joiner wakes up and checks if the target is in `_ST_ST_ZOMBIE` state. If yes, the loop exits. If not (for example, a spurious wakeup), it waits again. If interrupted via `st_thread_interrupt`, `st_cond_timedwait` returns an error and `st_thread_join()` returns `-1` immediately.
+
+**After the target is zombie:**
+
+1. Read the return value: `*retvalp = thread->retval` (if `retvalp` is non-NULL).
+2. **Reschedule the zombie for final cleanup:** Remove the zombie from `zombie_q`, set its state to `_ST_ST_RUNNABLE`, insert into `run_q`. This lets the zombie resume in `st_thread_exit()` to destroy its condvar and free its stack (the second `_st_switch_context` in `st_thread_exit`).
+
+**Why the joiner reschedules the zombie instead of cleaning up directly:** The zombie's stack contains the `_st_thread_t` struct itself (thread metadata is carved from the top of the stack — see `st_thread_create`). If the joiner freed the stack, it would destroy the thread struct it's still reading from. Instead, the joiner puts the zombie back on `run_q`, and the zombie cleans up its own resources when it gets scheduled — running on its own stack, which is safe to free at the very end (the final `_st_switch_context` switches away before the stack memory is actually reused).
+
+**The complete lifecycle of a joinable thread:**
+
+1. `st_thread_create(start, arg, joinable=1, ...)` — born, placed on `run_q`
+2. Scheduled → runs `_st_thread_main()` → runs `start(arg)`
+3. `start` returns → `st_thread_exit(retval)` called
+4. Cleanup runs, state → `_ST_ST_ZOMBIE`, inserted into `zombie_q`
+5. `st_cond_signal(term)` wakes the joiner
+6. First `_st_switch_context` — zombie suspends
+7. Joiner wakes, reads `retval`, moves zombie from `zombie_q` to `run_q`
+8. Zombie resumes, destroys condvar, frees stack
+9. Final `_st_switch_context` — zombie is gone forever
+
+**Related test cases (`st_utest_learn_kb.cpp`):**
+- `TEST(LearnKB, ReturnValueThroughJoin)`
+- `TEST(LearnKB, JoinDrivesFirstRunWhenNoManualYield)`
+- `TEST(LearnKB, ThreadExitExplicitRetvalThroughJoin)`
+- `TEST(LearnKB, ThreadExitNonJoinableCannotJoin)`
+- `TEST(LearnKB, StartRoutineNotExecutedInline)`
+
+## `st_thread_interrupt()` — Waking a Coroutine From Any Wait State
+
+`st_thread_interrupt(thread)` is the "cancel" mechanism — it forces a coroutine to wake up regardless of what it's waiting on (I/O, sleep, condvar, mutex). The interrupted coroutine sees `EINTR` when it resumes.
+
+**The function (sched.c):**
+
+1. **Dead thread check.** If `thread->state == _ST_ST_ZOMBIE`, return immediately — can't interrupt a dead thread.
+
+2. **Set the interrupt flag.** `thread->flags |= _ST_FL_INTERRUPT`. This flag persists until the interrupted coroutine checks and clears it upon resumption.
+
+3. **Already running/runnable check.** If the thread is `_ST_ST_RUNNING` or `_ST_ST_RUNNABLE`, just return — the flag is set, and the thread will see it next time it enters an I/O or wait function. No queue manipulation needed.
+
+4. **Remove from sleep heap (if present).** If `_ST_FL_ON_SLEEPQ` is set, call `_st_del_sleep_q(thread)` to remove it from the timeout heap. This is necessary because the thread is being woken prematurely — its timeout is no longer relevant.
+
+5. **Make runnable.** Set `thread->state = _ST_ST_RUNNABLE`, insert into `run_q` (at tail via `st_clist_insert_before`).
+
+**What the function does NOT do:** It does not remove the thread from `io_q`, `cvar->wait_q`, or `lock->wait_q`. This is handled by the interrupted coroutine itself when it resumes — the same pattern as timeout wakeups in `st_poll()` (checks `pq.on_ioq`), `st_cond_timedwait()` (removes self from `wait_q`), and `st_mutex_lock()` (removes self from `wait_q`).
+
+**How each wait function detects interruption:**
+
+- **`st_poll()` (I/O wait):** After resuming from `_st_switch_context`, checks `me->flags & _ST_FL_INTERRUPT`. If set, clears the flag, sets `errno = EINTR`, returns -1. The `pq.on_ioq` flag is still 1 (dispatch didn't wake us), so the poll entry is also cleaned up from `io_q` and epoll.
+
+- **`st_usleep()` (sleep):** Checks the interrupt flag both before sleeping and after resuming. If set, clears it, returns `EINTR`.
+
+- **`st_cond_timedwait()` (condvar wait):** After resuming, removes self from `wait_q`, then checks `_ST_FL_INTERRUPT`. If set, clears it, returns `EINTR`.
+
+- **`st_mutex_lock()` (mutex wait):** After resuming, removes self from `wait_q`. If the interrupt flag is set AND the thread is not the mutex owner (another thread could have unlocked the mutex at the same time as the interrupt), returns `EINTR`.
+
+**The interrupt flag is "sticky" until consumed:** Once set, it stays set until a blocking path checks and clears it. If the thread is already running, the flag has no immediate effect. At the next wait point with explicit interrupt checks (`st_poll`, `st_usleep`, `st_cond_timedwait`, `st_mutex_lock`), the call returns `EINTR` instead of remaining blocked. For I/O wrappers like `st_read`/`st_write`, interruption is observed when they enter `st_poll` (typically after `EAGAIN`); if the syscall succeeds immediately, they may return data instead of `EINTR`. This design prevents races where an interrupt arrives between deciding to wait and actually suspending.
+
+**Interrupt vs. the sleep heap:** When a thread is on both `io_q`/`wait_q` and the sleep heap (e.g., `st_cond_timedwait` with timeout), `st_thread_interrupt` only removes it from the sleep heap. The thread remains on `io_q`/`wait_q` until it resumes and cleans up. This is safe because the cleanup is always done by the thread itself after waking.
+
+**Use in SRS:** `st_thread_interrupt` is how SRS implements graceful shutdown. When SRS needs to stop (e.g., SIGINT), it interrupts all active coroutines. Each coroutine's I/O call returns `EINTR`, the coroutine sees the shutdown flag, and exits cleanly. Without this mechanism, coroutines blocked on I/O would never wake up to check if the server is shutting down.
+
+**Critical lifecycle rule (do not assume immediate termination):** `st_thread_interrupt()` does **not** terminate the target coroutine synchronously. It only marks and wakes the coroutine so that its current blocking call (for example `st_read`) can return, typically `-1` with `errno=EINTR`. The coroutine must then cooperatively unwind and return from its entry function. Therefore, the interrupter should use a join/synchronization step (for joinable threads, `st_thread_join`) to wait for actual thread exit, instead of assuming that an interrupted coroutine is already dead.
+
+**Related test cases (`st_utest_learn_kb.cpp`):**
+- `TEST(LearnKB, ThreadInterruptWakeupFromUsleep)`
+- `TEST(LearnKB, ThreadInterruptWakeupFromCondWait)`
+- `TEST(LearnKB, ThreadInterruptWakeupFromMutexWait)`
+
+## The Netfd Abstraction (`_st_netfd_t`) — ST's File Descriptor Wrapper
+
+Most socket/file descriptors used with ST's convenience I/O APIs are wrapped in a `_st_netfd_t`. This struct is the bridge between raw OS file descriptors and ST's coroutine I/O system. Most ST I/O helpers (`st_read`, `st_write`, `st_accept`, `st_connect`, etc.) take `_st_netfd_t*` instead of raw `int` fds, while the lower-level `st_poll()` API still accepts raw `struct pollfd` descriptors.
+
+**The struct (common.h):**
+
+```c
+typedef struct _st_netfd {
+    int osfd;                    /* Underlying OS file descriptor */
+    int inuse;                   /* In-use flag */
+    void *private_data;          /* Per-descriptor private data */
+    _st_destructor_t destructor; /* Private data destructor function */
+    void *aux_data;              /* Auxiliary data for internal use */
+    struct _st_netfd *next;      /* For putting on the free list */
+} _st_netfd_t;
+```
+
+- `osfd` — The actual OS file descriptor number (what you'd pass to `read(2)`, `write(2)`)
+- `inuse` — Whether this wrapper is active (1) or recycled on the free list (0)
+- `private_data` + `destructor` — Per-fd user data with cleanup callback, set via `st_netfd_setspecific()`. SRS uses this to attach application-level connection objects to the fd
+- `aux_data` — Reserved for internal use (currently a no-op; historically used for accept serialization in multi-process setups)
+- `next` — Singly-linked list pointer for the free list
+
+**The Free List — Object Recycling:**
+
+`_st_netfd_t` objects are recycled via a thread-local singly-linked free list (`_st_netfd_freelist`). When a netfd is freed (`st_netfd_free`), it's pushed onto the free list. When a new netfd is needed (`_st_netfd_new`), the free list is checked first — if non-empty, a recycled object is popped; otherwise a fresh one is `calloc`'d. This avoids malloc/free churn for the most frequently created/destroyed objects in a server (one per connection).
+
+The free list is `static __thread` — each pthread in a multi-threaded ST setup has its own free list, requiring no locking. Note: earlier documentation described the netfd freelist as protected by a pthread mutex, the only shared-state lock. That no longer matches the code: the freelist is declared thread-local (`static __thread _st_netfd_t *_st_netfd_freelist`), so no mutex is needed. A mutex for shared netfd state existed in older versions of the toffaletti multi-threading fork, but the current ossrs/state-threads code uses `__thread` isolation instead.
+
+**Creating a Netfd — `_st_netfd_new(osfd, nonblock, is_socket)`:**
+
+This is the internal constructor called by all public creation functions:
+
+1. **Notify the event system:** Calls `_st_eventsys->fd_new(osfd)`. For epoll, this is `_st_epoll_fd_new` which ensures the per-fd data array (`fd_data`) is large enough to hold the fd's index — expanding it via `realloc` if needed. For kqueue, this does the same with its own `fd_data` array. This is how ST tracks per-fd reference counts and revents.
+2. **Get or allocate the wrapper:** Pop from `_st_netfd_freelist` if available, otherwise `calloc` a new one.
+3. **Set non-blocking mode:** If `nonblock` is true (for example in `st_netfd_open_socket`, and in `st_accept` on platforms where accepted sockets do not inherit non-blocking mode), sets `O_NONBLOCK`. For sockets, tries `ioctl(FIONBIO)` first (one syscall) — if that fails, falls back to `fcntl(F_GETFL)` + `fcntl(F_SETFL, O_NONBLOCK)` (two syscalls). Non-blocking mode is essential — without it, `read`/`write` would block the entire process instead of returning `EAGAIN` for the coroutine to handle.
+4. **Return the wrapper** with `osfd` set and `inuse = 1`.
+
+**Public creation functions:**
+
+- `st_netfd_open(osfd)` — Wrap any fd, set non-blocking. Used for pipes, FIFOs, etc. Calls `_st_netfd_new(osfd, 1, 0)` — `is_socket=0` means it skips the `ioctl(FIONBIO)` shortcut.
+- `st_netfd_open_socket(osfd)` — Wrap a socket fd, set non-blocking. Calls `_st_netfd_new(osfd, 1, 1)` — `is_socket=1` enables the faster `ioctl` path.
+- `st_open(path, oflags, mode)` — Open a file/FIFO with `O_NONBLOCK` added to `oflags`, then wrap it. The fd is already non-blocking from the `open()` call, so `_st_netfd_new` is called with `nonblock=0` (no need to set it again).
+
+**Closing a Netfd — `st_netfd_close(fd)`:**
+
+1. **Notify the event system:** Calls `_st_eventsys->fd_close(fd->osfd)`. For epoll, `_st_epoll_fd_close` checks that no coroutines are still watching this fd (all reference counts must be zero) — if any are non-zero, it returns `EBUSY` and the close fails. This prevents closing an fd that other coroutines are waiting on.
+2. **Recycle the wrapper:** Calls `st_netfd_free(fd)` which clears `inuse`, calls the private data destructor if set, and pushes the wrapper onto the free list.
+3. **Close the OS fd:** `close(fd->osfd)`.
+
+**Freeing without closing — `st_netfd_free(fd)`:**
+
+Sometimes you want to release the ST wrapper without closing the underlying OS fd (e.g., if another library owns the fd lifecycle). `st_netfd_free` does this: clears `aux_data`, calls the private data destructor, sets `inuse = 0`, and pushes to the free list. The OS fd remains open.
+
+**The Per-Fd Data Array in the Event System:**
+
+The event system maintains an array indexed by OS fd number. Initial sizing is backend-specific (epoll starts from `fd_hint`, derived from fd limits and `ST_EPOLL_EVTLIST_SIZE`; kqueue starts from `FD_SETSIZE`), and then grows dynamically via `realloc` as larger fd values appear. For epoll, this is `_st_epoll_data->fd_data` — an array of `_epoll_fd_data_t`:
+
+```c
+typedef struct _epoll_fd_data {
+    int rd_ref_cnt;   /* Number of coroutines waiting for read */
+    int wr_ref_cnt;   /* Number of coroutines waiting for write */
+    int ex_ref_cnt;   /* Number of coroutines waiting for exception */
+    int revents;      /* Fired events from last epoll_wait */
+} _epoll_fd_data_t;
+```
+
+This is **not** inside `_st_netfd_t` — it's a separate array in the event system, indexed by raw fd number. The reference counts track how many coroutines are watching each direction on each fd. When `st_poll()` registers a coroutine via `_st_epoll_pollset_add`, it increments the appropriate counts and calls `epoll_ctl(EPOLL_CTL_ADD)` or `epoll_ctl(EPOLL_CTL_MOD)`. During wakeup/timeout cleanup, `_st_epoll_pollset_del` always decrements the counts, but it calls `epoll_ctl` only for descriptors that did **not** fire in the current dispatch pass; the epoll registrations of fired descriptors are updated in the dispatch function's second pass.
+
+This separation (netfd wrapper vs. event system per-fd data) is a clean design: the netfd is the application-facing handle, while the per-fd data is the event system's internal bookkeeping. Multiple netfd wrappers could theoretically point to the same osfd (though this would be unusual), and the event system tracks watchers by raw fd number regardless.
+
+**I/O Initialization — `_st_io_init()`:**
+
+Called once during `st_init()`, this function does two things:
+
+1. **Ignore SIGPIPE.** Sets the `SIGPIPE` handler to `SIG_IGN` via `sigaction`. Without this, writing to a closed socket would kill the process. With it, `write()` just fails with `-1` and `errno = EPIPE`, which ST handles normally.
+2. **Maximize fd limit.** Reads `RLIMIT_NOFILE` via `getrlimit`, raises `rlim_cur` to `rlim_max` via `setrlimit`, and stores the result in `_st_osfd_limit`. On macOS where `rlim_max` can be negative (a platform quirk), it falls back to the event system's limit or `rlim_cur`. This fd limit influences backend initial sizing, but per-fd arrays are still expanded dynamically as needed.
+
+**Per-Fd Private Data — `st_netfd_setspecific` / `st_netfd_getspecific`:**
+
+These let application code attach arbitrary data to a netfd, similar to pthread key-specific data but per-fd instead of per-thread. `st_netfd_setspecific(fd, value, destructor)` stores a pointer and a cleanup function; when the netfd is freed, the destructor is called automatically. SRS uses this to associate connection handler objects with their socket fds.
+
+**Why most ST network I/O helpers take `_st_netfd_t*` (with `st_poll` as the raw-fd exception):**
+
+The netfd wrapper ensures three invariants: (1) the fd is always non-blocking, (2) the event system knows about the fd and can track watchers, and (3) the fd can carry application-specific data. Without the wrapper, application code could accidentally use a blocking fd with ST's helper I/O functions, bypassing the coroutine scheduler and stalling the entire process. For APIs that require `_st_netfd_t*`, this is a compile-time guard against passing raw `int` fds; when direct raw-fd polling is needed, ST exposes `st_poll()` for that purpose.
+
+**Related test cases (`st_utest_learn_kb.cpp`):**
+- `TEST(LearnKB, NetfdSpecificAndDestructorOnClose)`
+- `TEST(LearnKB, NetfdFreeKeepsOsfdOpen)`
+- `TEST(LearnKB, BasicNetfdWriteThenRead)`
+- `TEST(LearnKB, BasicNetfdReadTimeout)`
+
+## The Event System Abstraction (`_st_eventsys_t`) — How ST Swaps I/O Backends
+
+ST uses a vtable pattern — a struct of function pointers — so the scheduler can call I/O multiplexing operations without knowing which backend (epoll, kqueue, or select) is active.
+
+**The vtable struct (common.h):**
+
+```c
+typedef struct _st_eventsys_ops {
+    const char *name;                          /* "select", "kqueue", "epoll" */
+    int  val;                                  /* ST_EVENTSYS_SELECT or ST_EVENTSYS_ALT */
+    int  (*init)(void);                        /* Create the OS multiplexer */
+    void (*dispatch)(void);                    /* The blocking wait + wake coroutines loop */
+    int  (*pollset_add)(struct pollfd *, int); /* Register fds when a coroutine starts waiting */
+    void (*pollset_del)(struct pollfd *, int); /* Unregister fds when I/O completes or times out */
+    int  (*fd_new)(int);                       /* Notify backend about a new fd (expand arrays) */
+    int  (*fd_close)(int);                     /* Check fd can be closed (no active watchers) */
+    int  (*fd_getlimit)(void);                 /* Hard fd limit (FD_SETSIZE for select, 0=unlimited) */
+    void (*destroy)(void);                     /* Tear down the event system */
+} _st_eventsys_t;
+```
+
+The global pointer `__thread _st_eventsys_t *_st_eventsys` is thread-local — each pthread in a multi-threaded ST setup gets its own event system instance. Each backend defines a static instance of this struct with its functions filled in (e.g., `_st_epoll_eventsys`, `_st_kq_eventsys`, `_st_select_eventsys`), and `st_set_eventsys()` points the global at the chosen one.
+
+**The pollfd bridge:** All three backends speak `struct pollfd` at the interface level — `pollset_add` and `pollset_del` take `struct pollfd*`. This is the abstraction layer. The scheduler only knows about `POLLIN`/`POLLOUT`/`POLLPRI`. Each backend translates these to its native API internally (epoll events, kqueue filters, or fd_sets).
+
+**How the scheduler uses the vtable:** The scheduler never calls `epoll_wait` or `kevent` or `select` directly. Every call goes through `_st_eventsys->`:
+
+- `st_init()` → `_st_eventsys->init()` — creates epoll fd / kqueue fd / initializes fd_sets
+- Idle thread loop → `_st_eventsys->dispatch()` — the big blocking wait
+- `st_poll()` → `_st_eventsys->pollset_add(pds, npds)` — when a coroutine suspends on I/O
+- I/O completion / timeout → `_st_eventsys->pollset_del(pds, npds)` — cleanup registrations
+- `_st_netfd_new()` → `_st_eventsys->fd_new(osfd)` — expand per-fd arrays if needed
+- `st_netfd_close()` → `_st_eventsys->fd_close(osfd)` — check no active watchers before close
+
+**Three backends, compile-time selected:**
+
+- **select** — fallback backend (used on Cygwin64 and also compiled on Darwin/Linux). Compile flag `MD_HAVE_SELECT`. Val `ST_EVENTSYS_SELECT` (1). Hard limit of `FD_SETSIZE` fds. State stored in `__thread` `fd_set`s with per-fd reference count arrays.
+- **kqueue** — macOS/Darwin. Compile flag `MD_HAVE_KQUEUE`. Val `ST_EVENTSYS_ALT` (3). No fd limit. State in `__thread` struct with per-fd data array, plus add/delete kevent lists.
+- **epoll** — Linux. Compile flag `MD_HAVE_EPOLL`. Val `ST_EVENTSYS_ALT` (3). No fd limit. State in `__thread` struct with per-fd data array (read/write/exception reference counts + revents).
+
+Kqueue and epoll both use `ST_EVENTSYS_ALT` — they're interchangeable "advanced" backends. Select is the fallback. The Makefile defines which flags are set per platform: Darwin gets `MD_HAVE_KQUEUE` + `MD_HAVE_SELECT`, Linux gets `MD_HAVE_EPOLL` + `MD_HAVE_SELECT`, Cygwin64 gets only `MD_HAVE_SELECT`. If none of the three are defined, compilation fails with `#error`.
+
+**Backend selection (`st_set_eventsys`):**
+
+`st_init()` calls `st_set_eventsys(ST_EVENTSYS_DEFAULT)`, which maps to select (it resolves to `ST_EVENTSYS_SELECT` internally). To get epoll or kqueue, application code must call `st_set_eventsys(ST_EVENTSYS_ALT)` **before** `st_init()`. For `ST_EVENTSYS_ALT`: if `MD_HAVE_KQUEUE` is defined, kqueue is used; else if `MD_HAVE_EPOLL` is defined and `_st_epoll_is_supported()` confirms the kernel actually supports it (by probing `epoll_ctl` for `ENOSYS`), epoll is used. Once set, the pointer is locked — calling `st_set_eventsys` again returns `EBUSY`. SRS calls `st_set_eventsys(ST_EVENTSYS_ALT)` to get epoll on Linux and kqueue on macOS.
+
+**The uniform dispatch pattern:** Despite different OS APIs, all three `dispatch` functions follow the same structure:
+
+1. Calculate timeout from sleep heap root (`sleep_q->due - last_clock`)
+2. Call the OS blocking function (`select` / `kevent` / `epoll_wait`)
+3. Mark fired fds in per-fd state
+4. Walk `io_q` — for each waiting coroutine, check its fds against fired results
+5. If any fd matched → remove from `io_q` (`on_ioq = 0`), call `pollset_del` to clean up registrations, remove from sleep heap if present, set `_ST_ST_RUNNABLE`, insert into `run_q`
+6. Clean up OS-level state for fired fds
+
+The scheduler, idle thread, coroutine suspension/resumption — all identical regardless of backend. Only the vtable functions differ.
+
+**Backend-specific details:**
+
+*Select:*
+- Maintains three `__thread` `fd_set`s (read/write/exception) with per-fd reference counts. `dispatch` must copy all three fd_sets before calling `select()` because select modifies them in place.
+- Has a `_st_select_find_bad_fd()` recovery handler — when `select` returns `EBADF`, it walks all waiting fds with `fcntl(F_GETFL)` to identify and remove the bad one.
+- The `maxfd` tracker is maintained across add/del/dispatch — select requires the highest fd number + 1 as its first argument.
+- The select backend data is `__thread` (`_st_select_data`), so each pthread keeps independent select bookkeeping.
+
+*Kqueue:*
+- Uses `EV_ONESHOT` flag — each registration fires once and auto-deregisters from kqueue. Elegant: no need to explicitly delete fired fds. Only unfired fds need explicit `EV_DELETE`.
+- Batches additions via an `addlist` — `pollset_add` queues `struct kevent` entries, and `dispatch` submits them all in a single `kevent()` call via the changelist parameter. This reduces syscalls.
+- Deletions are synchronous — `pollset_del` calls `kevent()` immediately with a `dellist` to avoid stale fd problems (can't defer because the fd might be closed before the next dispatch).
+- Handles **fork recovery** — if `getpid()` changes after `kevent` returns `EBADF`, it re-creates the kqueue fd and re-registers all fds from `io_q`. Kqueue fds don't survive `fork()`.
+- Timeout uses `struct timespec` (nanosecond precision).
+- `destroy` exists in the vtable, but the kqueue backend's cleanup is currently a TODO (`_st_kq_destroy` is not implemented yet).
+
+*Epoll:*
+- Level-triggered (no `EPOLLET` flag) — simplest model, events re-fire on every `epoll_wait` until the fd is consumed or removed.
+- Uses reference counting (`rd_ref_cnt`, `wr_ref_cnt`, `ex_ref_cnt`) per fd to support multiple coroutines watching the same fd. `epoll_ctl` is called to `ADD`/`MOD`/`DEL` as reference counts transition between 0 and non-zero.
+- `dispatch` cleans up in two passes: the first pass (inside the `io_q` walk) calls `pollset_del`, which handles unfired fds but skips fired fds because their `_ST_EPOLL_REVENTS` is still set. The second pass (after the `io_q` walk) iterates the epoll result list, clears revents, and calls `epoll_ctl` with `EPOLL_CTL_MOD` or `EPOLL_CTL_DEL` based on remaining reference counts. This two-pass design avoids modifying epoll state while still iterating results that depend on it.
+- Timeout uses milliseconds (epoll_wait limitation). Rounds up sub-millisecond timeouts to 1ms to avoid spin loops — if `min_timeout > 0` but computes to 0ms, it's bumped to 1ms.
+- `_st_epoll_is_supported()` probes at selection time by calling `epoll_ctl(-1, EPOLL_CTL_ADD, -1, &ev)` — if errno is `ENOSYS`, epoll syscalls are stubs and the backend is rejected.
+
+**Related test cases (`st_utest_learn_kb.cpp`):**
+- `TEST(LearnKB, EventSysSelectedAndLockedAfterInit)`
+- `TEST(LearnKB, BasicNetfdReadTimeout)`
+- `TEST(LearnKB, BasicNetfdWriteThenRead)`
diff --git a/openclaw/skills/kb-review/SKILL.md b/openclaw/skills/kb-review/SKILL.md
new file mode 100644
index 000000000..c3174f6f6
--- /dev/null
+++ b/openclaw/skills/kb-review/SKILL.md
@@ -0,0 +1,68 @@
+---
+name: kb-review
+description: Review and correct SRS knowledge base documents (memory/srs-*.md) by loading relevant source code into context and identifying inaccuracies. Use when asked to review, correct, verify, or check the knowledge base, documentation accuracy, or when someone wants to find issues in srs-overview.md or srs-coroutines.md sections.
+---
+
+# KB Review — Knowledge Base Accuracy Checker
+
+Review SRS knowledge base documents against the actual codebase to find inaccuracies.
+
+## Setup
+
+Do **not** hardcode an absolute SRS path. Resolve `SRS_ROOT` dynamically:
+
+1. If `SRS_ROOT` env is set and contains `trunk/src`, use it.
+2. Else, if current workspace (or its git root) contains `trunk/src`, use that.
+3. Else, if `~/git/srs/trunk/src` exists, use `~/git/srs`.
+4. Else, ask the user for the SRS repo root.
+
+All paths below are relative to `$SRS_ROOT`.
+
+**Key paths:**
+- Knowledge base: `openclaw/memory/srs-*.md` (i.e. `memory/srs-*.md` when the workspace is the `openclaw/` dir)
+- SRS source: `trunk/src/` (subdirs: app, core, kernel, protocol, utest, main)
+- ST library: `trunk/3rdparty/st-srs/`
+- Config: `trunk/conf/full.conf`
+
+## Workflow
+
+**Step 1: Identify target document and section**
+
+List files matching `memory/srs-*.md` and present them to the user. Ask the user which document and which section to review — let the user type the section name freely (do not list sections for them to pick from).
+
+**Step 2: Read the document section**
+
+Read the chosen section text fully.
+
+**Step 3: Identify and load all relevant source code**
+
+Analyze the section content — every function name, struct, config directive, protocol, file, or mechanism mentioned. Then:
+
+1. Determine which part of the codebase the section covers
+2. Use the appropriate skill to load the code — e.g., `st-develop` skill for ST/coroutine code
+3. Follow that skill's loading instructions fully — do not skip files or read partially
+4. If no skill exists for the relevant codebase area, search and load the files directly
+
+The goal: have every piece of code the section describes loaded in context before reviewing.
+
+**Step 4: Review and report issues**
+
+Compare every claim in the document against the loaded source code. Check for:
+
+- **Factual errors** — Function names, struct names, variable names that don't match code
+- **Outdated info** — Behavior described that no longer matches current implementation
+- **Missing context** — Important details in the code not mentioned in the doc
+- **Wrong mechanics** — Incorrect description of how something works vs what the code actually does
+- **Version/date errors** — Wrong version numbers or dates (cross-check git tags if needed)
+- **Config errors** — Wrong config directive names, wrong default values
+
+Present findings as a numbered list. For each issue:
+1. Quote the problematic text from the doc
+2. Explain what the code actually shows
+3. Cite the specific file and line(s)
+
+If the section is accurate, say so — don't invent issues.
+
+**Step 5: Ask if user wants corrections applied**
+
+After presenting issues, ask if the user wants to apply fixes to the document. Only edit with explicit approval.
diff --git a/openclaw/skills/srs-learn/SKILL.md b/openclaw/skills/srs-learn/SKILL.md
new file mode 100644
index 000000000..1c912620e
--- /dev/null
+++ b/openclaw/skills/srs-learn/SKILL.md
@@ -0,0 +1,134 @@
+---
+name: srs-learn
+description: For developers who want to become SRS contributors or maintainers — learn SRS or any of its modules (ST, protocols, media) in depth, understand detailed code and implementation, media architecture, and the underlying knowledge behind it all. The learning path for anyone who wants to touch, modify, or extend the codebase.
+---
+
+# SRS Learn
+
+## Purpose
+Turn SRS knowledge base docs into hands-on learning sessions:
+- Start from `memory/srs-*.md`
+- Let the user choose what to learn
+- Teach with real source code
+- Default: create a new, standalone unit test file for the learner
+- If the user explicitly requests reusing/modifying a specific existing utest file, follow the user's request instead of forcing a new file
+- Build and run it successfully before moving on
+- Teach workflow and debugging until the topic is understood
+
+## Setup
+Before starting a learning session:
+- Resolve `SRS_ROOT` dynamically:
+  1. If `SRS_ROOT` env is set and contains `trunk/src`, use it.
+  2. Else, if current workspace (or its git root) contains `trunk/src`, use that.
+  3. Else, if `~/git/srs/trunk/src` exists, use `~/git/srs`.
+  4. Else, ask the user for SRS repo root.
+- Confirm knowledge base files exist in workspace: `memory/srs-*.md`.
+- Identify the matching specialized skill for the topic (e.g. `st-develop` for ST/coroutines). A specialized skill is **required** — if none exists, abort (see Step 3).
+
+## Learning Workflow
+Follow this sequence every time.
+
+1. Identify the target knowledge base.
+2. Summarize concrete sections and let the user choose.
+3. Find the matching specialized skill — abort if none exists.
+4. Teach the section with code + new utest file (build success + run success required).
+5. Explain utest workflow and debugging.
+6. Confirm mastery and propose the next step.
+
+Do not skip user choice steps.
+
+## Step 1: Select Knowledge Base
+List all matching files: `memory/srs-*.md`.
+
+Ask the user:
+- Which KB file to learn now?
+- What is the goal (overview, deep internals, debugging, implementation)?
+- Preferred depth (quick, normal, deep)?
+
+If the user already specifies the KB, proceed directly.
+
+## Step 2: Summarize Concrete Learning Sections
+Read the selected KB fully.
+
+Extract concrete, teachable sections and present them as a numbered menu. For each item, include:
+- Section/topic name
+- What the learner will master
+- Main source files/functions to inspect
+- A candidate utest demo idea
+
+Keep the menu concise and actionable (typically 3-8 items).
+
+Ask the user to select one item before continuing.
+
+## Step 3: Find Specialized Skill (Required)
+After the user picks a section, identify the matching specialized skill.
+
+srs-learn **cannot** create, build, or run utests on its own. It relies entirely on the specialized skill for:
+- Loading the correct source code context
+- Creating utest files in the right location with the right patterns
+- Building and running utests
+
+Example: for ST/coroutine topics, use `st-develop`.
+
+**If no matching specialized skill exists for the selected topic, abort the learning task.** Tell the user which topic/module lacks a skill and that one needs to be created before this topic can be learned through srs-learn.
+
+## Step 4: Teach with Code + New Unit Test
+Read and follow the specialized skill identified in Step 3. It owns the build/test workflow.
+
+After completing build/run for a lesson, always run the specialized skill's required verifier (if defined) before declaring completion.
+
+By default, create a **new, standalone utest file** so each lesson has a clean, isolated artifact to study. If the user explicitly asks to continue in a specific existing utest file, modify that file instead.
+
+Teach in this order:
+
+1. Explain the concept briefly from KB (what and why).
+2. Walk through the concrete code path (entry → core logic → output/effect).
+3. Create a new utest file that demonstrates one specific behavior from the section.
+4. Build the utest — confirm zero build errors.
+5. Run the utest — confirm it passes.
+6. Explain why the test passes based on code logic.
+
+Both **build success** and **run success** are required before the lesson is considered complete. If either fails, debug and fix, then retry. If blocked, explicitly report the blocker and current failure output.
+
+Unit test guidelines:
+- Keep the scope narrow (one behavior per test).
+- Use clear naming tied to the concept.
+- Prefer deterministic inputs and assertions.
+- Reuse existing utest patterns from the repository.
+
+## Step 5: Explain Utest Workflow
+After running the test, explicitly teach the workflow:
+- How test setup/fixtures map to module state
+- What action triggers the behavior
+- What assertions validate correctness
+- How this test connects to production code flow
+
+Then give a debugging walkthrough:
+- Where to set breakpoints/logs
+- Which variables/state transitions matter most
+- Common failure signatures
+- How to isolate regressions quickly
+
+## Step 6: Mastery Check and Iteration
+Ask short mastery-check questions:
+- "What does this test prove?"
+- "Which function is the true decision point?"
+- "If this assertion fails, where do we debug first?"
+
+If the user wants more practice:
+- Propose an extension exercise (new edge case or variation)
+- Create a new utest file for it (same rules: build success + run success), unless the user explicitly asks to continue in an existing utest file
+- Discuss results
+
+## Output Format During Sessions
+Use this response structure during learning sessions:
+
+1. Selected section and objective
+2. Code map (files/functions)
+3. Unit test plan
+4. Utest implementation + build/run results (must include test file path and explicit pass evidence)
+5. Workflow explanation
+6. Debugging checklist
+7. Mastery check + next step
+
+Keep explanations technical and direct. Prioritize concrete code behavior over abstract theory.
diff --git a/openclaw/skills/srs-support/SKILL.md b/openclaw/skills/srs-support/SKILL.md
index 65a174cf2..eeea1450d 100644
--- a/openclaw/skills/srs-support/SKILL.md
+++ b/openclaw/skills/srs-support/SKILL.md
@@ -9,29 +9,30 @@ Answer questions about SRS using the knowledge base in the SRS repository.
 
 ## Setup
 
-The user must have the SRS repository cloned locally. The knowledge files live in the `openclaw/` directory of the repo.
+The user must have the SRS repository cloned locally. The knowledge files live in the `openclaw/` directory inside the SRS repo.
 
-## Finding the Repository
+Do **not** hardcode an absolute SRS path. Resolve `SRS_ROOT` dynamically:
 
-The default and recommended path is `~/git/srs/`. Check there first for `openclaw/memory/srs-overview.md`.
+1. If `SRS_ROOT` env is set and contains `openclaw/memory/srs-overview.md`, use it.
+2. Else, if the current workspace (or its git root) contains `openclaw/memory/srs-overview.md`, use that as `SRS_ROOT`.
+3. Else, if `~/git/srs/openclaw/memory/srs-overview.md` exists, use `~/git/srs`.
+4. Else, ask the user for their SRS repo root (or to clone it).
 
-If not found, ask the user to either:
-1. Clone the SRS repo to `~/git/srs/` (recommended): `git clone https://github.com/ossrs/srs.git ~/git/srs`
-2. Tell you where their existing SRS repo is located
+All paths below are relative to `$SRS_ROOT`.
 
 ## Loading Knowledge
 
 On first question, load **all** `srs-*.md` files from `openclaw/memory/` into context:
 
 ```bash
-ls openclaw/memory/srs-*.md
+ls "$SRS_ROOT"/openclaw/memory/srs-*.md
 ```
 
 Read every file found. Do not selectively load or search — load the entire knowledge base. Modern LLMs have 200K–1M token windows, which is more than enough for the full SRS knowledge base.
 
 ## Knowledge Files
 
-All files are in `openclaw/memory/` within the SRS repo:
+All files are in `$SRS_ROOT/openclaw/memory/`:
 
 - **srs-overview.md** — Core reference: what SRS is, supported protocols and codecs, transmuxing/transcoding, sources (Live/SRT/RTC), configuration (`conf/` files and env vars), ecosystem tools, dependencies, community context, performance notes, feature list with versions/dates
 
diff --git a/openclaw/skills/st-develop/SKILL.md b/openclaw/skills/st-develop/SKILL.md
new file mode 100644
index 000000000..8eb1d494e
--- /dev/null
+++ b/openclaw/skills/st-develop/SKILL.md
@@ -0,0 +1,67 @@
+---
+name: st-develop
+description: Anything related to coroutines, State Threads (ST), or SRS's concurrency model. Use when discussing coroutine concepts, updating coroutine knowledge (srs-coroutines.md), developing/debugging/porting ST source code, porting ST to new CPU architectures or OSes, debugging coroutine context switching, analyzing ST scheduler behavior, adding new platform assembly, fixing ASAN/Valgrind/SEH issues, or understanding ST internals (sched, stk, sync, key, io, event, context switch ASM).
+---
+
+# ST Development
+
+State Threads (ST) is a C coroutine library. Source lives in `trunk/3rdparty/st-srs/` inside the SRS repo.
+
+Default SRS repo path is `~/git/srs`, but do **not** hardcode this path.
+Always resolve `SRS_ROOT` dynamically:
+
+1. If `SRS_ROOT` env is set and contains `trunk/3rdparty/st-srs`, use it.
+2. Else, if current workspace (or its git root) contains `trunk/3rdparty/st-srs`, use that.
+3. Else, if `~/git/srs/trunk/3rdparty/st-srs` exists, use `~/git/srs`.
+4. Else, ask the user for the SRS repo root.
+
+All ST source paths below are relative to `$SRS_ROOT`.
+
+## Setup: Load Knowledge Base (MANDATORY)
+
+Before any ST work, use the `read` tool to load the knowledge base. Do NOT use memory_search — read the full file directly.
+
+- `memory/srs-coroutines.md`
+
+## Loading ST Source Code (ON REQUEST)
+
+When the user asks to load the ST codebase (or needs you to work directly with the source), load **ALL** ST source files — no partial loads.
+
+All under `$SRS_ROOT/trunk/3rdparty/st-srs/`:
+
+Headers: `public.h`, `common.h`, `md.h`
+
+Core C: `sched.c`, `stk.c`, `sync.c`, `key.c`, `io.c`, `event.c`, `common.c`
+
+Platform ASM: `md_darwin.S`, `md_linux.S`, `md_linux2.S`, `md_cygwin64.S`
+
+Build: `Makefile`
+
+**Load every single file listed above — no shortcuts, no skipping.**
+
+## Unit Tests (utest)
+
+ST has a Google Test-based unit test suite in `$SRS_ROOT/trunk/3rdparty/st-srs/utest/`:
+
+- `st_utest.cpp` / `st_utest.hpp` — Test main and shared helpers
+- `st_utest_coroutines.cpp` — Coroutine tests (start, params, multiple coroutines, addition across yields)
+- `st_utest_tcp.cpp` — TCP connection test
+- `gtest-fit/` — Embedded Google Test framework
+
+**Build targets** (in the ST Makefile):
+- `darwin-debug-utest` — macOS debug build + utest
+- `linux-debug-utest` — Linux debug build + utest
+- `cygwin64-debug-utest` — Cygwin64 debug build + utest
+
+Coverage variants: `darwin-debug-gcov`, `linux-debug-gcov` (adds `-fprofile-arcs -ftest-coverage`).
+
+The build compiles ST as a static library first, then builds and links the utest binary at `obj/st_utest`.
+
+## Verifying Changes
+
+After any ST change (including utest-only changes), run the verifier script in this skill folder (not in the ST codebase):
+
+- `scripts/verify.sh`
+
+This script must resolve `SRS_ROOT` dynamically and run unit tests in `$SRS_ROOT/trunk/3rdparty/st-srs`.
+Always run verification before considering a change complete.
diff --git a/openclaw/skills/st-develop/scripts/verify.sh b/openclaw/skills/st-develop/scripts/verify.sh
new file mode 100755
index 000000000..a0bd045a9
--- /dev/null
+++ b/openclaw/skills/st-develop/scripts/verify.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Verify ST changes by building and running unit tests.
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+# Navigate: scripts/ -> st-develop/ -> skills/ -> openclaw/ -> srs/
+SRS_ROOT="$(cd "$SCRIPT_DIR/../../../.." && pwd)"
+ST_DIR="$SRS_ROOT/trunk/3rdparty/st-srs"
+
+if [[ ! -d "$ST_DIR" ]]; then
+  echo "Error: ST_DIR does not exist: $ST_DIR" >&2
+  exit 1
+fi
+
+echo "ST source: $ST_DIR"
+
+CMAKE_DIR="$SRS_ROOT/cmake"
+BUILD_DIR="$SRS_ROOT/cmake/build"
+
+mkdir -p "$BUILD_DIR"
+cd "$BUILD_DIR"
+
+cmake .. -DCMAKE_BUILD_TYPE=Debug
+cmake --build . --target st_utest
+
+./st-build/st_utest
diff --git a/trunk/3rdparty/srs-docs/blog/2024-10-18-Hidden-Flaws-of-SRS.md b/trunk/3rdparty/srs-docs/blog/2024-10-18-Hidden-Flaws-of-SRS.md
new file mode 100644
index 000000000..4c9864d06
--- /dev/null
+++ b/trunk/3rdparty/srs-docs/blog/2024-10-18-Hidden-Flaws-of-SRS.md
@@ -0,0 +1,595 @@
+# The Hidden Flaws of SRS: What You Need to Know
+
+In open-source streaming server projects, SRS is widely used and has been maintained for a long time, starting in 2013. Over the years, various issues have been addressed, but some remain challenging to solve. This article summarizes and shares these insights.
+
+Some people think SRS has many features and is complete, but it is far from a perfect open-source project and never will be. Anyone can contribute to the SRS project at any time, and participation is always welcome.
+
+## Cluster
+The cluster capability is a significant weakness of SRS. There have been ongoing debates and design revisions regarding the types of clusters it actually supports.
+
+SRS has supported Edge clusters since version 2.0 in 2015, claiming to be capable of large-scale RTMP/FLV distribution. However, after a decade of development, it was found that very few users actually utilize Edge, especially considering the lack of support for protocols like SRT and WebRTC. There is a notable disparity in capabilities between open-source project users and what video cloud services require in terms of media streaming clusters.
+
+In my assessment, SRS doesn’t necessarily need to support large-scale distribution but rather focus on small to medium-sized clusters. However, even in smaller clusters, it’s essential to support thousands or tens of thousands of concurrent connections. Moreover, it’s crucial to have robust cluster capabilities for mainstream protocols, not just limited to RTMP/FLV.
+
+When discussing media streaming clusters, apart from the media processing server SRS, there are also scheduling and operational systems involved. Even with RTMP URL access systems, there is still DNS resolution and scheduling. While WebRTC signaling typically uses HTTPS, separate from media transport, clusters inherently involve scheduling and distribution capabilities.
+
+Firstly, SRS lacks cluster scheduling capabilities as it’s primarily an open-source media server. To build a million-concurrent-user media streaming cluster, one needs to address scheduling independently. CDNs typically use DNS or HTTP-DNS scheduling. Scheduling essentially determines the system’s capacity utilization and operational costs.
+
+Secondly, the cluster architecture could be tree-based (source-edge) or graph-based (forwarding or cascading) but there isn’t a definitive answer. In China, due to multiple operators, tree architecture is common for handling massive playback issues in live streaming. However, for building a global network or scenarios with massive cold streams (like network conferences), a tree structure is inadequate.
+
+Lastly, load balancing for streaming servers, including load collection and overload protection, is usually balanced internally within the system. While scheduling acts as the first line of load balancing defense, internal load balancing is crucial. Although SRS has implemented circuit breaking, it hasn’t yet implemented load collection and balancing, like redirecting overloaded traffic to other servers through 302 redirects. For more on load balancing, refer to the Load Balancing Streaming Servers summary.
+
+Below is the source-edge architecture of SRS, where multiple Edge servers retrieve streams from the Origin server. Each stream is fetched from the Origin only once per Edge, and Edges can also fetch streams from other Edges, creating a multilevel Edge structure to support massive playback:
+
+```text
+                           SRS(Edge)
+SRS(Origin) --RTMP/FLV---> SRS(Edge) --RTMP/FLV--> Massive Playback
+                           SRS(Edge)
+```
+
+> Note: This Edge architecture was initially derived from Adobe AMS. However, AMS used a modified RTMP protocol with custom messages for cluster management, while SRS uses a standard RTMP protocol, achieving a standard RTMP client implementation.
+
+* Supports only RTMP/FLV protocols, not SRT/WebRTC/HLS. WebRTC protocol, in particular, requires significant cluster support for handling more concurrency due to its performance differences.
+* Typically, Nginx is needed for HLS distribution and supporting HTTPS-FLV protocol. SRS’s performance in HTTPS and HLS distribution is low and may lead to stuttering issues due to disk I/O blocking.
+* Source retrieval is a fixed configuration, forming a tree-like distribution network that can be challenging to adjust. For multiple source retrievals, source server clusters or retrieval based on vhost are required.
+
+Next is the source server cluster architecture of SRS, where multiple source servers connect in a mesh, exchanging stream information. They use RTMP 302 redirection to direct clients to the target source server, dispersing streams to multiple sources to support more streams:
+
+```text
+SRS(Origin)
+   +(MESH)  ---RTMP---> SRS(Edge) --RTMP/FLV--> Streaming or Playback
+SRS(Origin)
+```
+
+* Source servers connect in a mesh, limiting the number of supported source servers.
+* If a source server has no stream, RTMP 302 is used to direct the client to the target source server, relying on Edge to handle RTMP 302 redirection messages.
+* When streams migrate between source servers, HLS recovery and generation issues are not handled well, leading to interruptions in HLS streams.
+
+In reality, this source server cluster isn’t an ideal solution, especially for the WebRTC protocol. Since SRS is single-threaded and WebRTC protocol performance is notably low, each SRS source server can support around 300 WebRTC connections. Enabling WebRTC and RTMP protocol conversion may support around 40 connections (with audio transcoding). While live streaming protocols may support 3000–5000 concurrent users, WebRTC is on a different scale, necessitating new source server cluster solutions.
+
+The new source server cluster architecture of SRS uses Proxy servers to forward traffic to backend source servers. Proxies are stateless and horizontally scalable, addressing many streaming scenarios:
+
+```text
+SRS(Origin)
+SRS(Origin) ---> SRS(Proxy) --> Streaming or Playback
+SRS(Origin)
+```
+
+* Source servers are not interdependent and register with the Proxy. Proxies are stateless and synchronize status through Redis, making deployment in K8s straightforward.
+* Proxies support all protocols: RTMP/FLV/HLS/SRT/WebRTC, proxying API and media traffic to source servers, supporting other media servers besides SRS, offering high versatility.
+* Despite the new cluster architecture, HLS recovery issues remain unresolved.
+
+Considering the WebRTC protocol, using Proxy to connect 100 SRS Origins can support up to 4000 streams even with WebRTC to RTMP protocol conversion (about 40 per Origin). Without conversion, each Origin supports about 300 WebRTC connections, so this setup can handle about 30K viewers in total, for example 100 streams with 300 viewers each.
+
+Note: SRS also supports Dynamic Forwarding, actively forwarding streams to different backend services, essentially acting as a proxy. The difference lies in SRS Proxy being implemented in Go, relying on Redis for statelessness, facilitating system integration and various business scheduling logic. On the other hand, SRS Forwarding is a feature implemented by SRS, supporting only the RTMP protocol, and making modifications can be complex.
+
+The new source server cluster can be used in conjunction with Edge servers, still supporting numerous RTMP streams and massive playback for each stream:
+
+```text
+SRS(Origin)                     SRS(Edge)
+SRS(Origin) ---> SRS(Proxy) --> SRS(Edge) --> Streaming or Playback
+SRS(Origin)                     SRS(Edge)
+```
+
+* Edge servers continue to support only RTMP/FLV protocols, but future support for WebRTC protocol is planned to enable massive playback for live streaming and RTC scenarios.
+* Proxies and Origins are typically regionally deployed. To deploy clusters in different regions, scheduling is usually based on vhost, assigning different vhosts to different regions.
+* Monitoring and full traceability of clusters are not fully supported by Proxy. Additionally, HLS stream recovery remains inadequate.
+
+Apart from top-tier video clouds and CDNs requiring millions of streaming and playback capabilities, in general applications, handling thousands of streams and playback is the scale most business scenarios can achieve. For smaller applications, WebRTC relies on clusters to address performance bottlenecks, while other protocols focus on avoiding single points of failure to improve availability.
+
+Cluster implementation for RTMP/FLV protocols is relatively straightforward. For Edge servers, RTMP/FLV is stateless, so if issues arise, switching to the next source server is usually sufficient. Clients may experience minor content shifts or repetitions, but these are generally imperceptible. Stateless here means that these states can be discarded and errors can be resolved through retries. If an Edge server encounters issues, clients simply retry, and error recovery relies on those client retries.
+
+Implementing a cluster for HLS protocol is challenging, especially in handling stream switching. HLS protocol can handle client stream reconnections correctly, using DISCONTINUITY tags to ensure continuous segments. However, if a source server crashes or restarts, SRS currently lacks persistent storage and recovery of segment information, leading to M3U8 content regeneration and client disruptions. Similar issues arise when streams migrate between source servers. HLS clusters need to address these exceptional scenarios.
+
+Another challenging aspect of HLS protocol is data statistics like connection numbers. Since HLS is segment-based and lacks a concept of connections, SRS introduced HLS_CTX to achieve connection statistics through QueryString. However, for HLS clusters, Edge servers need to implement HLS distribution and connection statistics, a feature not yet available in SRS. If using Nginx for Edge HLS distribution, connection statistics support needs to be added to Nginx.
+
+For SRT and WebRTC, SRS currently only supports Proxy, enabling stream scheduling to different Origins, but Edge servers do not support these two protocols. This limitation means that if a stream has a large number of viewers, SRT or WebRTC protocols cannot be used. Addressing how to achieve comprehensive protocol stack support on Edge servers is a challenging aspect of clusters.
+
+Typically, K8s or HELM are used to manage clusters, including deployment, upgrades, scaling, shrinking, and rollbacks. SRS clusters consider K8s support, with a critical capability being Gracefully Quit, where ports are closed upon exit, and services are restarted only after existing connections gradually close, enabling smooth upgrades. Gracefully Quit is easily implemented for TCP protocols, but UDP protocol support is still pending.
+
+For end-to-end tracing, SRS passes connection IDs and other information in the RTMP protocol and supports the OpenTelemetry standard protocol, integrating with Tencent Cloud’s APM. However, APM lacks an open-source platform like Prometheus, and integrating with different APM cloud services requires some effort. Additionally, SRS Proxy does not yet support APM instrumentation extensively. SRS’s own APM instrumentation is limited, providing only essential information and not comprehensive coverage.
+
+Using Go to implement Proxy poses potential performance bottlenecks. As Go handles media traffic, it may not achieve the same high performance as C++, especially on multi-core CPUs, potentially leading to a 30% performance loss. This issue is not significant in small clusters, and most scenarios do not require such high performance levels. RUST could be an alternative option, but maintaining C++, Go, and RUST simultaneously increases community maintenance complexity and incurs high costs.
+
+Regarding clusters, there are considerations around multi-process or multi-threading issues.
+
+## Multi-processing
+Multi-core processing or multi-threading is essential for servers as they typically support multiple CPU cores, enabling a single machine to offer more services externally. From a technical standpoint, there are various solutions:
+
+* Multi-process: For instance, Nginx operates as a multi-process system, where each process is single-threaded. Due to Nginx’s influence, most servers of its era adopt this architecture. The challenge with this architecture lies in handling long-lived connections, such as those in live media streaming, where two connections need to be dispatched to a single process. In essence, multi-processing isn’t an ideal architecture for a media streaming server, which explains why this architecture is rarely seen in modern media servers — it’s challenging to maintain.
+* Multi-threading: Servers predating Nginx, like Adobe AMS, Real Helix, Wowza, Janus, used a multi-threaded architecture. However, this architecture is outdated due to performance losses from thread switching and the potential for thread and data race issues leading to crashes and deadlocks. Over the last decade, multi-threading has evolved into a thread-local architecture, seen in systems like Envoy and ZLM. Thread-local architecture is essentially multi-process, but with the advantage of shared data between processes. This architecture is the most appealing evolution direction for SRS, but it hasn’t fully convinced SRS to adopt it yet — it’s not quite good enough, as I’ll explain later.
+* Cluster architecture: K8s falls under this category, functioning as a distributed architecture that virtualizes a single machine into a K8s cluster. Each Pod can be a single process, and multiple Pods and Services form a cluster to provide services externally. SRS’s Proxy-Origin-Edge cluster operates on this architecture. While this architecture introduces a Proxy element that may impact performance, it offers advantages in maintenance and scalability. K8s not only addresses media distribution issues but also handles deployment, scaling, monitoring, updates, and more.
+
+SRS has traditionally been single-process, single-threaded, akin to a single-process version of Nginx, with the addition of coroutines for concurrent processing. Coroutines are implemented using the StateThreads library, which has been modified to support thread-local functionality for operation in a multi-threaded environment. Despite experimenting and analyzing thread-local handling for a media architecture over the years, SRS has not adopted a thread-local approach but rather a different multi-threaded architecture that is still in the planning stage:
+
+* Stream processing occurs on a single thread, while blocking operations like logging, file writing, and DNS resolution are handled by separate threads. In essence, SRS uses multi-threading to address blocking issues. If Linux supports fully asynchronous I/O in the future, multi-threading may not be necessary, as seen in liburing.
+* StateThreads multi-threading faces issues with Windows C++ exception handling. Windows’ exception mechanism differs from Linux, causing compatibility problems when StateThreads implements setjmp and longjmp, as discussed in SEH.
+* Challenges with multi-thread scheduling and load balancing. While thread-local multi-threading addresses multi-core utilization, it still confines the streaming and playback of a stream to a single thread, preventing complete load balancing across multiple threads. Without thread-local functionality, serious locking and contention issues arise. Essentially, it’s like running multiple K8s Pods within a single process and handling scheduling, monitoring, and load balancing internally, which can be quite complex.
+
+In SRS 5.0, StateThreads was restructured to support thread-local functionality, and SRS started a main thread and subthreads to transition the architecture into a multi-threaded model. However, various issues arose during subsequent stages, leading to a default return to a single-threaded architecture in SRS 6.0. The future may see the removal of multi-threading capabilities as they become less critical with the new Proxy and Edge architectures. If Proxy continues to enhance its capabilities, encompassing various protocols and Edge functionality, it will gradually evolve into a Proxy+Origin cluster, fully resolving the multi-threading challenge.
+
+Additionally, we explored another potential architecture where specific capabilities are distributed across different threads, like using separate threads for WebRTC encryption and decryption. However, this approach transforms into a typical multi-threaded program rather than a thread-local architecture, resulting in performance overhead from locks and reduced stability — not an ideal direction.
+
+In essence, Go can be considered a traditional multi-threaded architecture, not a thread-local architecture, which is why Go also requires locks, contributing to its lower performance. For media streaming clusters, Go may not be the best language as performance losses increase costs for media servers. While C++ is prone to crashing issues, which can’t be fundamentally resolved, RUST might be a potential option, albeit requiring research and experimentation.
+
+The stability issues in medium to large C++ projects are indeed a pain point, especially when introducing SharedPtr for Source object release.
+
+## Smart Pointer
+The memory leakage issue in SRS took approximately 10 years to resolve, as detailed in the Source cleanup discussion.
+
+In reality, it wasn’t exactly a leak but rather a failure to release caches. Initially, SRS focused on handling a small number of RTMP streams with a large number of viewers, prioritizing the release of client connections over Stream Source objects. This approach was chosen because most memory was consumed by client connections, while Stream Source objects occupied minimal memory. By not releasing Stream Source objects, the system was simplified, avoiding many memory issues and ensuring high stability.
+
+However, if there were frequent URL changes for streams or a high volume of publishing with few viewers, rapid memory escalation became evident.
+
+Releasing Stream Source objects requires addressing various object references to Stream Source. Since SRS supports protocols like RTMP, HTTP-FLV, HLS, SRT, WebRTC, DASH, GB28181, and their conversions, scenarios involving mutual references arise.
+
+Consider an RTMP client pushing to SRS, creating a Stream Source for each URL:
+
+```cpp
+srs_error_t SrsRtmpConn::stream_service_cycle() {
+    SrsLiveSource* source = NULL;
+    _srs_sources->fetch_or_create(req, server, &source);
+}
+```
+
+When starting a stream, an HTTP mount is created to enable HTTP FLV playback:
+
+```cpp
+srs_error_t SrsHttpStreamServer::http_mount(SrsLiveSource* s, SrsRequest* r) {
+    entry = new SrsLiveEntry(mount);
+    entry->source = s;
+    entry->stream = new SrsLiveStream(s, r, entry->cache);
+    mux.handle(mount, entry->stream);
+}
+
+srs_error_t SrsLiveStream::do_serve_http(ISrsHttpResponseWriter* w, ISrsHttpMessage* r) {
+    SrsLiveConsumer* consumer = NULL;
+    source->create_consumer(consumer);
+}
+```
+
+If the Source remains valid, business logic can be easily implemented. However, if cleanup is needed, various scenarios must be considered, significantly increasing the risk of dangling pointers:
+
+* During unpublish, HTTP Streams must be cleaned up, requiring clients to be disconnected first, and Source can only be released after the connection is destroyed.
+* Since SRS allows playback before publishing, all HTTP Streams must be disconnected before Source cleanup.
+* In Edge mode, playing an HTTP Stream creates a Source, triggering a fetch from the Origin to publish. Cleanup is triggered when the last client disconnects, posing a self-destruction risk for HTTP Streams.
+
+In practice, C++ 11’s shared pointer, with some advanced features, not only manages memory through reference counting but also includes:
+
+* shared_from_this: This common ability returns a shared pointer from a raw pointer. However, since the raw pointer is managed by a shared pointer, creating a new shared pointer directly could lead to double releases. This feature isn't essential, so SRS hasn't implemented it.
+* inheritance and compare: Involving inheritance and comparison, these scenarios deal with smart pointer inheritance and comparison, which aren't necessary for SRS.
+* weak pointer: If there's a circular reference, shared pointer reference counting may fail, necessitating the use of weak pointers to avoid circular references. Weak pointers are similar to raw pointers but provide a function to check if the shared pointer is still valid. In SRS's context, circular references can be avoided, so weak pointers aren't implemented.
+
+After implementing simplified smart pointers, Stream Source is managed using timers for checking and releasing. This standard pointer usage involves releasing the Source smart pointer when the reference count reaches zero:
+
+```cpp
+srs_error_t SrsLiveSourceManager::notify(int event, srs_utime_t interval, srs_utime_t tick) {
+    std::map< std::string, SrsSharedPtr<SrsLiveSource> >::iterator it;
+    for (it = pool.begin(); it != pool.end();) {
+        SrsSharedPtr<SrsLiveSource>& source = it->second;
+        source->cycle();
+
+        if (source->stream_is_dead()) {
+            pool.erase(it++); // Free source smart ptr
+}
+```
+
+To avoid circular references, the back-reference is held as a raw pointer, and a comment documents why. For instance:
+
+```cpp
+// Source holds and releases hub
+class SrsLiveSource {
+    SrsOriginHub* hub;
+}
+
+class SrsOriginHub : public ISrsReloadHandler {
+    // Because source references to this object, so we should directly use the source ptr.
+    SrsLiveSource* source_;
+}
+```
+
+Another scenario involves two objects triggering destruction, but their lifecycles aren’t independent. For instance, in the RTC context, a session like SrsRtcConnection contains SrsRtcTcpConn. When the session times out, the TCP connection must be closed. Conversely, when the TCP connection closes, it must trigger session destruction:
+
+```cpp
+// Session contains the TCP connection
+class SrsRtcConnection {
+    SrsRtcNetworks* networks_;
+}
+class SrsRtcTcpNetwork: public ISrsRtcNetwork {
+    SrsSharedResource<SrsRtcTcpConn> owner_;
+}
+
+// TCP connection directly uses the session's raw pointer, as the session's lifecycle is shorter
+class SrsRtcTcpConn {
+    // Because session references to this object, we should directly use the session ptr.
+    SrsRtcConnection* session_;
+}
+
+// When TCP disconnects, trigger session expiration and destruction
+srs_error_t SrsRtcTcpConn::cycle() {
+    // Only remove session when network is established, because client might use other UDP network.
+    if(session_ && session_->tcp()->is_establelished()) {
+        session_->tcp()->set_state(SrsRtcNetworkStateClosed);
+        session_->expire();
+    }
+}
+```
+
+Dealing with GB28181 is more complex, where a session includes both Sip and Media components. Although there are no circular reference issues, the session may be created from Sip, and Sip and Media may need to update the session:
+
+```cpp
+class SrsGbSession {
+    SrsSharedResource<SrsGbSipTcpConn> sip_;
+    SrsSharedResource<SrsGbMediaTcpConn> media_;
+}
+
+class SrsGbSipTcpConn {
+    // The owner session object; we use the raw pointer and avoid freeing it.
+    SrsGbSession* session_;
+}
+
+class SrsGbMediaTcpConn {
+    // The owner session object; we use the raw pointer and avoid freeing it.
+    SrsGbSession* session_;
+}
+
+srs_error_t SrsGbSipTcpConn::bind_session(SrsSipMessage* msg, SrsGbSession** psession) {
+    SrsSharedResource<SrsGbSession>* session = dynamic_cast<SrsSharedResource<SrsGbSession>*>(_srs_gb_manager->find_by_id(device));
+    // Create a Session object from the SIP channel
+    if (!session) {
+        raw_session = new SrsGbSession();
+        session = new SrsSharedResource<SrsGbSession>(raw_session);
+        _srs_gb_manager->add_with_id(device, session);
+    }
+    // Update the SIP channel object in the Session.
+    raw_session->on_sip_transport(*wrapper_);
+}
+
+srs_error_t SrsGbMediaTcpConn::bind_session(uint32_t ssrc, SrsGbSession** psession) {
+    SrsSharedResource<SrsGbSession>* session = dynamic_cast<SrsSharedResource<SrsGbSession>*>(_srs_gb_manager->find_by_fast_id(ssrc));
+    SrsGbSession* raw_session = (*session).get();
+    // Update the Media channel object in the Session.
+    raw_session->on_media_transport(*wrapper_);
+}
+```
+
+In contrast to RTC, for GB, the SIP and Media channels, when released, only free their connection objects and don’t trigger GB Session destruction. Instead, GB Session is destroyed through timeouts.
+
+In practical applications, SRS’s smart pointers are used in limited scenarios, without applying advanced features or syntactic sugar. This approach enhances the maintainability of the solution.
+
+## Error vs Logging
+In most cases, errors will eventually occur in areas prone to issues, making error handling crucial for system operation. Errors and logging are often intertwined, with a common practice being to log error information in the logs. However, errors and logging are two distinct matters.
+
+Logging involves recording system actions and can be used to troubleshoot issues. Errors represent a specific type of behavior, and the relationship between errors and logs includes:
+
+* Logs are categorized, enabling the opening of more detailed logs for investigating challenging issues. Errors do not have levels, but errors can be written to logs.
+* Errors should provide comprehensive stack traces since the call path is critical when an error occurs. Errors typically occur in lower-level functions, making understanding and resolving errors dependent on different call paths.
+* In C++, errors can be expressed using error codes and exceptions. Error codes are more commonly used and facilitate proper error handling.
+
+Problems such as irregular frame rates can arise even when no error occurs. Therefore, logs need to record the critical path even in the absence of errors, so that the health of the system can be assessed:
+
+```text
+[2024-09-27 10:32:46.245][INFO][58467][f951w1of] RTMP client ip=127.0.0.1:57591, fd=13
+[2024-09-27 10:32:46.246][INFO][58467][f951w1of] complex handshake success
+[2024-09-27 10:32:46.247][INFO][58467][f951w1of] connect app, tcUrl=rtmp://localhost:1935/live, pageUrl=, swfUrl=, schema=rtmp, vhost=localhost, port=1935, app=live, args=null
+[2024-09-27 10:32:46.247][INFO][58467][f951w1of] protocol in.buffer=0, in.ack=0, out.ack=0, in.chunk=128, out.chunk=128
+[2024-09-27 10:32:46.247][INFO][58467][f951w1of] client identified, type=fmle-publish, vhost=localhost, app=live, stream=livestream, param=, duration=0ms
+[2024-09-27 10:32:46.247][INFO][58467][f951w1of] connected stream, tcUrl=rtmp://localhost:1935/live, pageUrl=, swfUrl=, schema=rtmp, vhost=__defaultVhost__, port=1935, app=live, stream=livestream, param=, args=null
+[2024-09-27 10:32:46.247][INFO][58467][f951w1of] new live source, stream_url=/live/livestream
+[2024-09-27 10:32:46.248][INFO][58467][f951w1of] source url=/live/livestream, ip=127.0.0.1, cache=1/2500, is_edge=0, source_id=/
+[2024-09-27 10:32:46.250][INFO][58467][f951w1of] start publish mr=0/350, p1stpt=20000, pnt=5000, tcp_nodelay=0
+[2024-09-27 10:32:46.251][INFO][58467][f951w1of] got metadata, width=768, height=320, vcodec=7, acodec=10
+[2024-09-27 10:32:46.251][INFO][58467][f951w1of] 46B video sh, codec(7, profile=High, level=3.2, 768x320, 0kbps, 0.0fps, 0.0s)
+[2024-09-27 10:32:46.251][INFO][58467][f951w1of] 4B audio sh, codec(10, profile=LC, 2channels, 0kbps, 44100HZ), flv(16bits, 2channels, 44100HZ)
+[2024-09-27 10:32:46.253][INFO][58467][f951w1of] RTMP2RTC: Init audio codec to 10(AAC)
+[2024-09-27 10:32:48.385][INFO][58467][f951w1of] cleanup when unpublish
+```
+
+For server logs, it’s essential to include IDs, such as session-level IDs. This enables quick identification of specific logs when serving multiple clients or when aggregating logs. SRS refers to this mechanism as “traceable logs,” allowing all logs related to a specific ID to be easily found. Due to SRS’s coroutine-based design, a session like an RTMP push connection may contain one or multiple coroutines. Hence, when logging, passing the ID isn’t necessary as it’s automatically obtained:
+
+```cpp
+srs_error_t SrsRtmpConn::do_cycle() {
+    srs_trace("RTMP client ip=%s:%d, fd=%d", ip.c_str(), port, srs_netfd_fileno(stfd));
+}
+
+void SrsProtocol::print_debug_info() {
+    srs_trace("protocol in.buffer=%d, in.ack=%d, out.ack=%d, in.chunk=%d, out.chunk=%d", in_buffer_length,
+        in_ack_size.window, out_ack_size.window, in_chunk_size, out_chunk_size);
+}
+
+// This log macro definition automatically fetches the coroutine ID from get_id().
+#define srs_trace(msg, ...) srs_logger_impl(SrsLogLevelTrace, NULL, _srs_context->get_id(), msg, ##__VA_ARGS__)
+```
+
+SRS errors contain stack information and error details, resembling Go’s error and wrap mechanism. With this approach, the error object alone provides the complete error context:
+
+```text
+[2024-09-27 10:40:30.836][INFO][62805][l9067692] RTMP client ip=127.0.0.1:57942, fd=15
+[2024-09-27 10:40:30.837][INFO][62805][l9067692] client identified, type=fmle-publish, vhost=localhost, app=live, stream=livestream, param=, duration=0ms
+[2024-09-27 10:40:30.838][ERROR][62805][l9067692][35] serve error code=1028(StreamBusy)(Stream already exists or busy) : service cycle : rtmp: stream service : rtmp: stream /live/livestream is busy
+thread [62805][l9067692]: do_cycle() [./src/app/srs_app_rtmp_conn.cpp:263][errno=35]
+thread [62805][l9067692]: service_cycle() [./src/app/srs_app_rtmp_conn.cpp:457][errno=35]
+thread [62805][l9067692]: acquire_publish() [./src/app/srs_app_rtmp_conn.cpp:1078][errno=35](Resource temporarily unavailable)
+```
+
+In SRS, errors are objects that provide complete context and can be sent through a tracing system, like OpenTelemetry APM, for full-chain error display. This necessitates that SRS errors are wrapped rather than directly returned as integers:
+
+```cpp
+srs_error_t SrsRtmpConn::do_cycle() {
+    if ((err = service_cycle()) != srs_success) {
+        err = srs_error_wrap(err, "service cycle");
+    }
+}
+
+srs_error_t SrsRtmpConn::service_cycle() {
+        if (!srs_is_system_control_error(err)) {
+            return srs_error_wrap(err, "rtmp: stream service");
+        }
+}
+
+srs_error_t SrsRtmpConn::acquire_publish(SrsSharedPtr<SrsLiveSource> source) {
+    if (!source->can_publish(info->edge)) {
+        return srs_error_new(ERROR_SYSTEM_STREAM_BUSY, "rtmp: stream %s is busy", req->get_stream_url().c_str());
+    }
+}
+```
+
+It’s common practice not to return error objects but to directly log errors, which can be a less sophisticated approach. For instance, low-level function errors that can be ignored might lead to log flooding if directly printed, prompting the function to return error codes instead. This can result in losing stack information. By returning error objects, the application layer can decide whether to discard or ignore errors, log warnings, log errors, or send alerts to monitoring systems. Directly logging errors fundamentally limits error handling possibilities.
+
+Challenges with SRS’s error mechanism include:
+
+* All functions must return srs_error_t pointer objects, not int error codes, making code writing somewhat rigid and relying solely on Code Review to maintain this rule.
+* When calling third-party libraries like FFmpeg, complete stack traces cannot be obtained, and only error code information is available. These libraries typically provide log hooks to route internal logs to SRS’s logs, but C libraries generally do not offer error object mechanisms.
+* High-frequency ignorable errors, like certain UDP packet errors, can lead to performance issues. Since UDP packet errors are numerous, generating an error object for each error can create performance bottlenecks. However, erroneous UDP packets are uncommon and can be directly discarded through early judgment.
+
+Initially, SRS used a globally incremented integer as the log ID, but later switched to generating random string IDs to reduce the likelihood of ID conflicts when logs are centrally collected. Besides IDs, timestamps are included, and logs are typically searched within a limited timeframe, such as one or three hours. The probability of random string ID conflicts is almost negligible.
+
+SRS 6.0 now supports OpenTelemetry APM for application performance monitoring, full-chain logging, and errors. Since APM requires integration with platforms, SRS integrates with Tencent Cloud’s APM. The APM protocol stack involves HTTP/2 and Protobuf, with SRS implementing the Protobuf protocol but supporting only the HTTP/1 protocol. While most APM platforms support integration using HTTP/1 + Protobuf, it’s not the default method. The use of newer protocols and inconsistent authentication mechanisms across cloud providers limits APM applications.
+
+In practice, system OpenAPIs are more commonly used than errors and logs, as detailed below.
+
+## OpenAPI
+API refers to the system’s external interface, where in a narrow sense, API generally refers to HTTP API. In reality, logs and errors can be considered as a broader form of API, including system configurations, HTTP APIs, Prometheus Exporter, and more.
+
+SRS has supported HTTP API and Callback since its early days, making it easy to integrate with business systems. Apart from supporting queries for stream and client information, it also enables actions like kicking off streams. Callback includes callbacks for events such as streaming, playback, recording, HLS, etc. Here are some areas that still need improvement:
+
+* Only supports HTTP Basic Authentication, lacking support for Bearer Token and other authentication methods. It is generally recommended to implement an HTTP proxy in Go for authentication and then forward requests to SRS. For instance, Oryx has implemented such a proxy.
+* Documentation is not comprehensive, only outlining request and response formats, including field information without detailed explanations of field meanings. Initially, this was due to the API’s instability, with the intention to explain field meanings in detail once stabilized, but this was not followed through.
+* Limited pagination and query capabilities; the clients API may contain a significant amount of data, requiring support for paginated queries. For example, it lacks fuzzy search by stream name; only specific information can be queried based on IDs. It also lacks sorting based on time. It is generally recommended to maintain a database of streams and clients based on callbacks to support more comprehensive queries.
+
+It is evident that providing APIs and Callbacks is not sufficient; processing of this data is necessary, supporting comprehensive queries, and even graphical representation. For operations and maintenance, monitoring alerts and historical query support are needed. These can be achieved through Prometheus and Grafana using SRS’s Exporter:
+
+```text
++-----+               +------------+     +---------+
+| SRS +--Exporter-->--| Prometheus +-->--+ Grafana +
++-----+   (HTTP)      +------------+     +---------+
+```
+
+Prometheus is a comprehensive monitoring and operations system that fetches data from SRS through a standardized Exporter API, stores it in its own time-series database, and uses its query language PromQL for querying. Grafana enables graphical representation. This approach eliminates the need for each user to convert HTTP APIs and Callbacks for their own monitoring and operations systems. By utilizing SRS’s Prometheus Exporter functionality, you can set up a graphical monitoring and operations dashboard within 10 minutes. For more details, refer to SRS Exporter. Here are some areas where the Exporter can be improved:
+
+* Supports limited metrics, currently only supporting machine-level monitoring data and lacking support for stream or client-level data. For instance, you can know the connection count, bandwidth, stream count, and client count for a specific SRS or entire cluster, but you cannot access statistics for a specific stream.
+* The provided dashboard is relatively simple, offering only a common Grafana dashboard. Different scenarios may require different Grafana dashboards.
+* Data on error rates and success rates are not precise enough. Typically, such data needs to be provided by clients; the server can only provide success rates for certain scenarios. Additionally, SRS’s current success rate is not accurate enough.
+
+Apart from data metrics, configuration can also be considered part of the interface, determining how the system behaves. SRS uses the configuration file method similar to nginx and supports reloading. Due to users managing configuration files, SRS ensures strong compatibility. Configuration files from SRS 1.0 can still be used in subsequent versions; whenever there are changes like renaming, SRS ensures compatibility with both old and new names after parsing the configuration. Here are some areas where configuration can be improved:
+
+* Configuration files are not very friendly in cloud-native systems; in reality, using environment variables is a more convenient configuration method. Referring to Grafana’s configuration, besides using file configuration, each configuration item can be controlled using environment variables. While SRS has implemented some configuration using environment variables, it does not yet support environment variable configuration at the vhost level.
+* At one point, SRS supported an HTTP RAW API that allowed configuration modification and serialization into configuration files, enabling configuration distribution. However, this method led to many issues, easily causing data conflicts and competition, ultimately leading to crashes. Consequently, this feature was removed in official versions. In cloud-native environments, configurations are managed through environment variables in YAML, requiring only YAML changes for configuration adjustments.
+* SRS supports configuration reloading, especially useful for system performance optimization. By changing the configuration and reloading, you can quickly verify if optimizations are effective without needing to restart the entire load test. However, this feature is overly complex; certain functions like changing listening ports require very intricate implementations, significantly impacting maintainability and stability. In the future, this feature may be gradually weakened or removed.
+
+Comparing configuration files and environment variable configurations, below is an example configuration file example.conf:
+
+```nginx
+listen 1935;
+http_api {
+  enabled on;
+}
+rtc_server {
+  enabled on;
+  candidate 192.168.3.82;
+}
+vhost __defaultVhost__ {
+  rtc {
+    enabled on;
+  }
+}
+```
+
+```bash
+docker run --rm -it -p 1985:1985 -p 8000:8000 \
+  -v $(pwd)/example.conf:/usr/local/srs/conf/docker.conf \
+  ossrs/srs:5
+```
+
+Describing the configuration file content in the documentation requires users to copy content, create and edit files, and then specify the file during startup, which is cumbersome and prone to errors. In contrast, using environment variables, everything can be done with a simple copy-paste, reducing the likelihood of user errors:
+
+```bash
+docker run --rm -it -p 1985:1985 -p 8000:8000 \
+  -e SRS_LISTEN=1935 \
+  -e SRS_HTTP_API_ENABLED=on \
+  -e SRS_RTC_SERVER_ENABLED=on \
+  -e CANDIDATE="192.168.3.82" \
+  -e SRS_VHOST_RTC_ENABLED=on \
+  ossrs/srs:5
+```
+
+Maintaining a comprehensive API requires significant effort and time, especially when adding new features to ensure that both the API and documentation are promptly updated. SRS’s API is far from perfect and requires further effort.
+
+## Protocols
+Streaming protocols are crucial capabilities of streaming servers, and the protocol capabilities required on servers differ from general media processing. For example, the discontinuation of ffserver was due to FFmpeg’s I/O design primarily focusing on clients, making it unsuitable for handling high concurrency as a server. For instance, SRS does not use a WebRTC codebase to implement the WebRTC protocol because SFU servers require lightweight protocol stacks without the need for media processing, device management, signaling, and algorithms.
+
+SRS natively supports protocols like RTMP, HTTP-FLV, HLS, and WebRTC, ensuring high stability and maintainability. SRT is integrated with the libsrt library for stability, but it lacks perfection in protocol conversion and performance. Protocols like DASH and GB28181 have inherent flaws leading to various issues. HDS and RTSP are outdated streaming protocols, but RTSP still finds application in certain scenarios in the AI era.
+
+Initially, SRS only supported common live streaming protocols: RTMP, HTTP-FLV, and HLS, along with live clusters such as Edge and Origin clusters. With the evolution of live streaming, protocols like WebRTC and SRT are increasingly used, significantly impacting the entire system architecture, especially in protocol conversion.
+
+Initially, protocol conversion only required support for RTMP to HLS. Although OBS can stream using HLS, it is not commonly used. In reality, SRS also supports streaming via POST HTTP-FLV, similar to POST HLS, but FLV streaming is not widely supported by CDNs and common clients, making this method less popular. However, SRS uses the Stream Caster structure to support this unique stream input and then converts it to RTMP; for instance, PUSH MPEGTS over UDP follows this method.
+
+Initially, both SRT and GB28181 used the Stream Caster structure; after SRS receives the stream, it initiates an RTMP Client to stream to localhost, somewhat resembling an external method of receiving these two protocols’ streams. These protocols encounter different issues; starting with SRT:
+
+* SRS did not fully implement the SRT protocol stack but used the libsrt library, which has its threads. This caused many problems in early SRS support for SRT and posed high maintenance challenges. Later, in SRS 5.0, this logic was rewritten, still using the libsrt library but modified based on StateThreads, improving overall stability and maintainability.
+* SRT uses UDP for data transmission, and Linux kernel’s UDP transmission performance is inherently low. Additionally, SRT uses TS over UDP, resulting in additional application-level overhead. Consequently, SRT is primarily suited for scenarios with low concurrency rather than high concurrency.
+* SRS supports both pushing and pulling streams using SRT, supports SRT to RTMP and WebRTC protocols but does not support RTMP to SRT conversion. Essentially, SRS treats SRT as an ingress protocol rather than a distribution protocol.
+
+The situation with GB28181 is different; in SRS 5.0, GB28181 protocol stack was completely re-implemented. Typically, GB28181 streams to SRS for viewing via WebRTC without plugins, addressing issues such as:
+
+* GB28181 is designed for intranet use and does not consider scenarios like packet loss and jitter, making it usable only with TCP on the internet. Hence, after rewriting the GB28181 protocol in SRS, only GB28181 2016 (TCP) protocol stack is supported, with UDP not implemented.
+* GB28181 includes various business capabilities like playback, storage, PTZ, with control messages implemented using SIP. SRS has a basic SIP protocol stack based on HTTP transformation, suitable for simple scenarios. Subsequently, a separate srs-sip project was implemented to allow SRS to use the GB28181 protocol without the built-in SIP stack, although this solution is not yet comprehensive, covering only basic functionalities.
+* GB28181 is primarily used in the domestic security field, emphasizing security, resulting in many private information and protocols. In reality, GB28181 has minimal internet application and is predominantly used in the security field. Projects focusing on private networks and security are challenging to maintain in open-source projects, making it difficult for open-source projects to excel in this area.
+
+The RTSP protocol was removed from SRS, initially supported through the Stream Caster method for RTSP stream ingestion. Main issues include:
+
+* Lack of RTSP streaming scenarios: security cameras typically do not push streams to SRS using RTSP; instead, clients pull and play streams from the cameras using RTSP. This is why SRS supports GB28181 streaming, although GB28181 also has limited internet application.
+* Unlike WebRTC, RTSP lacks robust congestion control capabilities, making it prone to stuttering and screen freezing issues during internet transmission. Consequently, RTSP is not an ideal protocol for streaming media over the internet.
+* Given that AI applications often involve object recognition in security cameras, many AI systems prefer using the RTSP protocol for integration. However, the future trend is towards RTMP and WebRTC, not RTSP. For example, Google’s first-generation cameras only support RTSP, while the second generation supports WebRTC.
+
+> Note: SRS is willing to support the RTSP protocol in the future, but the long-term assessment indicates that the primary streaming protocols will remain RTMP, WebRTC, and SRT, rather than RTSP. The community’s willingness and resources to support and maintain a declining RTSP protocol remain uncertain.
+
+SRS’s RTMP implementation is undoubtedly the most comprehensive, addressing several issues but also having areas for improvement:
+
+* Enhanced RTMP protocol now supports HEVC, AV1, Opus, and other new encoding standards, allowing avoidance of audio transcoding when converting to the WebRTC protocol. Currently, SRS only supports HEVC in RTMP and has yet to support AV1 and Opus.
+
+SRS supports WebRTC as a core protocol because it is the only option for streaming protocols supported by browsers. However, there are several areas where WebRTC implementation in SRS can be improved:
+
+* Inadequate support for congestion control algorithms in WebRTC; currently, only NACK is supported, lacking FEC and GCC. This is partly due to the lower performance of SRS’s WebRTC protocol stack and the complexity and limited applicability of these algorithms.
+* Lack of support for WebRTC clusters; while SRS 7.0 supports WebRTC Origin clusters (Proxy solution) for expanding source station support, Edge clusters are not yet supported for high viewer counts per stream. Expected support for Edge clusters is targeted for SRS 8.0.
+* Performance and compatibility challenges; the single-process architecture makes it difficult to enhance performance as WebRTC performance bottlenecks exist at both the application and kernel levels. Continuous adaptation and issue resolution are necessary to improve protocol conversion compatibility. Rust is being evaluated as a potential solution for these challenges; refer to the subsequent section on Rust for detailed information.
+
+SRS supports segmented protocols like HLS, DASH, and HDS, with HLS being the most widely used, HDS nearly obsolete, and DASH having some users but facing certain issues:
+
+* SRS’s HLS does not support multiple bitrates, as video transcoding is required for multi-bitrate HLS, which can only be achieved using FFmpeg. SRS’s HLS also lacks support for MP4 and LLHLS. SRS already supports DASH, including MP4 encapsulation, and supporting LLHLS is relatively straightforward.
+* DASH is not as friendly for live streaming as HLS in terms of protocol design simplicity and reliability; refer to DASH’s issues for a detailed description.
+
+MP4 can also be considered a protocol commonly used in recording files; LLHLS also utilizes MP4, especially for HLS HEVC, as Safari only supports MP4 segmented files, not TS segmented files. MP4 compatibility issues are more common than with RTMP, occasionally leading to various unexpected problems. It is recommended to use FFmpeg for recording and transcoding, while relying on SRS for stream reception and distribution only.
+
+## Testing
+SRS’s quality assurance mechanisms include unit tests based on gtest, black-box testing based on srs-bench, and Code Review. These mechanisms are interconnected through GitHub Actions, automatically triggered for every Pull Request, Commit, and Release.
+
+These tests are highly effective, often leading to Pull Requests failing due to logic issues caused by code modifications in other areas. At first glance, test cases may not seem effective, since they essentially confirm common-sense judgments like 1+1=2. In reality, errors are seldom introduced in the modified areas but rather surface in other parts due to the intricate relationships between different sections of code. As code grows, these interdependencies often lead to such issues.
+
+Programmers typically have great confidence in their own code but lack confidence in others’ code. Few developers voluntarily write test cases, and when asked to add unit tests during Code Review, most acknowledge the necessity of testing, especially after realizing how tests can uncover logic issues. Personally, even when recognizing the effectiveness of testing, voluntarily writing tests remains a challenging task, making it difficult to adhere to the practice of writing tests before functional code.
+
+I believe that one of the most crucial roles of Code Review is to require the submission of test code. Of course, the primary function of Code Review is to integrate a contributor’s code into one’s own codebase, necessitating an understanding of the code’s purpose and intent, and exploring better implementation options. Common issues include developers unknowingly duplicating existing code functions or failing to leverage existing code for improvements due to unfamiliarity with the current codebase. Additionally, contributors often prioritize habitual implementations over the most maintainable solutions.
+
+In Code Review, open-source communities typically adhere to a single rule: one Pull Request should contain only one feature or bug fix. Due to limited time in open-source communities, including multiple features in a single change makes explaining the code to different individuals challenging, requiring detailed explanations of each function’s purpose, making the review process difficult. Conversely, in a corporate setting, almost every change includes some “convenient” modifications, leading to ineffective Code Reviews and significant code quality issues.
+
+Moreover, although companies have ample time for coding, the urgency from clients often results in a rush to deploy numerous features and changes, leading to Pull Requests containing multiple features. Companies spend less time on code than open-source communities, failing to allocate sufficient time to weigh the most maintainable solutions. This is why open-source communities are more likely to produce high-quality code, whereas companies are more prone to lower-quality code, despite having top-tier programmers.
+
+This is not to say that open-source communities always produce high-quality code; Code Review standards can easily slip, especially when introducing significant code changes or large features. The cases of SRS’s GB28181 and RTSP are typical examples where, due to substantial changes and time constraints, insufficient Review time led to a compromise in code quality. This resulted in numerous bug reports, necessitating the rewriting of existing code and the diligent addition of tests and tools.
+
+It is essential to emphasize that the issue is not with the low quality of code submitted by other developers but rather the lack of sufficient time spent during Code Review, resulting in lower code quality post-merge. For instance, when reviewing GB28181 code, I did not invest time in thoroughly understanding the GB28181 protocol, reviewing each line of code meticulously, demanding comprehensive testing, or conducting stress and black-box testing. This was simply negligence on my part, failing to enforce strict Review standards, for which I take full responsibility.
+
+There are significant issues with SRS’s testing, primarily inadequate coverage. Despite a 53% coverage rate, some core functionalities remain uncovered, such as the Edge cluster. At times, there is a tendency towards laziness; for example, even with the development of the Proxy cluster, there was no test coverage. Although the importance of test coverage is well understood, at times, it is challenging to implement, with the thought that the feature is still unstable and testing can be added once it stabilizes.
+
+Code quality is independent of programming language; without effective code quality standards and strict adherence to these standards, any language can produce low-quality code.
+
+## RUST
+It’s evident that Go is not the future of streaming servers due to several reasons:
+
+* Go incurs significant performance losses, with multithreading and garbage collection leading to a 30% increase in overall operational costs, which can be unsustainable for an online system. While Go’s technology stack can be used for learning about streaming or quickly prototyping, the performance issues of Go render it inadequate for long-term development in the streaming server domain.
+* The disparity between Go’s ecosystem and C poses challenges when incorporating C code into Go, as using Cgo can result in numerous memory issues. This essentially means abandoning Go’s goroutines in favor of dealing with more memory issues from C, making it challenging to continuously rewrite the entire ecosystem from C to Go, as seen in projects like pion.
+* Go is not a replacement for C and C++, implying that for future development, the community and projects relying on C cannot depend on Go for improvements and enhancements. Fundamentally, Go is not the solution for streaming servers, regardless of the efforts invested in this direction.
+
+Life goes on, projects need to progress, new capabilities must be developed, and we cannot wait; action must be taken. Is RUST a suitable path forward? It might be worth exploring if RUST addresses the key challenges faced by streaming servers and if the issues introduced by RUST are manageable.
+
+Firstly, let’s revisit some key themes mentioned earlier:
+
+* Cluster: Whether implemented in C or RUST, cluster capabilities involve essentially the same logic, including Proxy, Origin stream processing and conversion, Edge aggregation, and more.
+* Multi-processing: RUST may have a slight advantage over C in terms of multi-threading capabilities. With the need for asynchronous and coroutine technologies, C faces some challenges as described earlier. RUST, being a newer language, provides encapsulation for threads at the language level, improving portability between systems with better support for async operations and basic communication components like channels.
+* Smart Pointer: RUST, similar to C++11, offers smart pointers in its standard library, providing a potential advantage over C, where developers need to implement smart pointers themselves.
+
+Apart from these issues, there are additional considerations that haven’t been discussed yet, which I will summarize below.
+
+* Dangling Pointers: RUST has a clear advantage in handling dangling pointers, as RUST does not encounter issues with dangling pointers, preventing crashes caused by such pointers. This problem fundamentally arises from C’s flexibility without restrictions, potentially leading to issues like accessing variables across threads, resulting in dangling pointers.
+
+In C++, the following code snippet does not have dangling pointer issues:
+```cpp
+int x = 100;
+std::thread t([&]() {
+  std::cout << "x is " << x << std::endl;
+});
+t.join();
+```
+
+However, if the thread does not finish as expected and references the variable after that variable has gone out of scope, it can lead to dangling pointer issues. This is a real problem that I have encountered. On the other hand, in RUST:
+```rust
+let x = 100;
+thread::spawn(move || {
+    println!("x is {}", x);
+}).join().unwrap();
+```
+
+This problem does not exist in RUST, as the compiler ensures that variables must be moved into the thread. RUST restricts sharing regular variables between threads due to potential risks. Although sharing in this case is not problematic, as the variable will be valid until the thread ends, it is not always the case, especially when continually maintaining a project and adding logic incrementally. Dangling pointer issues are quite common, and I have often seen them during crash troubleshooting in real projects.
+
+RUST, like any other language, has its strengths and weaknesses. It solves some problems while introducing others:
+
+* Steep Learning Curve: RUST introduces ownership mechanisms to address memory issues without a garbage collector, which can be challenging to grasp. When ownership is combined with multithreading and async concepts, understanding them becomes increasingly difficult. While GPT-4 may struggle to explain these complex ideas and compiler errors, the good news is that O1 works well.
+* Optional Async Mechanism: RUST’s async behavior is similar to Python, where not all functions are async. If an IO library does not implement async, you need to do it yourself. In contrast, Go follows an all-async strategy, with all libraries supporting async operations. RUST’s std library has limited async and await support, with runtime being a third-party library like tokio, meaning tokio needs to implement its async std, and other libraries also need to support async in their own way. If a third-party library includes threads, it may not work seamlessly with tokio, which contains a thread pool.
+* Limited std and Inconsistent Third-party Libraries: While RUST’s std documentation is comprehensive and of high quality, it lacks the extensive libraries found in Go’s std. Many third-party RUST libraries have varying levels of documentation and quality, posing challenges for the open-source community.
+
+For server programs, supporting asynchronous IO (async) is essential. RUST’s async and ST are quite similar in this regard. Most async IO operations are based on polling, such as Linux epoll. Nginx, for instance, directly uses epoll. When reading and writing, partial writes may occur, indicating that the buffer is full, requiring readiness polling before writing again. Dealing with these issues for every read and write can lead to cumbersome application logic, resulting in callback hell. Go and ST create application-level coroutines, while RUST also uses spawn to execute a future. Comparing the code snippets:
+
+```go
+//////////////////////////////////////////////////////////////////////////////
+// Go
+listener, _ := net.Listen("tcp", "0.0.0.0:8080")
+for {
+    conn, _ := listener.Accept()
+    go handleTCP(conn)
+}
+
+// Goroutine handling the TCP connection
+func handleTCP(conn net.Conn) {
+    defer conn.Close()
+    buf := make([]byte, 1024)
+    n, _ := conn.Read(buf)
+}
+```
+
+```c
+//////////////////////////////////////////////////////////////////////////////
+// ST(State Threads)
+int fd = socket(AF_INET, SOCK_STREAM, 0);
+::bind(fd, (const sockaddr*)&addr, sizeof(addr)); // addr is the listening address
+::listen(fd, 10);
+st_netfd_t stfd = st_netfd_open_socket(fd);
+for {
+    st_netfd_t client = st_accept(stfd, NULL, NULL, ST_UTIME_NO_TIMEOUT);
+    st_thread_create(serverTCP, client, 0, 0);
+}
+
+// ST coroutine serving the TCP connection
+void* serverTCP(void* arg) {
+    st_netfd_t client = (st_netfd_t)arg;
+    char buf[1024];
+    int n = st_read(client, buf, sizeof(buf), ST_UTIME_NO_TIMEOUT);
+}
+```
+
+```rust
+//////////////////////////////////////////////////////////////////////////////
+// RUST async
+let listener = tokio::net::TcpListener::bind("0.0.0.0:8080").await.unwrap();
+loop {
+    let (socket, _) = listener.accept().await.unwrap();
+    tokio::spawn(processTCP(socket));
+}
+
+// Handling the TCP connection using async
+async fn processTCP(mut socket: tokio::net::TcpStream) {
+    let mut buf = vec![0; 1024];
+    let n = socket.read(&mut buf).await;
+}
+```
+
+RUST Runtime Support: RUST’s runtime supports both single-threaded and multi-threaded environments, allowing tasks to potentially move between different threads when using spawn and await. In contrast, ST only supports thread-local operations, requiring the application layer to handle how tasks are passed, which is not yet supported. Go’s M:N model provides a sophisticated multi-threaded scheduler implemented at the language level, offering the highest level of support.
+
+Performance Analysis: ST relies entirely on assembly for coroutine switching without a multithreading mechanism, with a codebase of around 5,000 lines. On the other hand, RUST’s tokio runtime delivers high performance without the impact of garbage collection. In the realm of high-performance servers, RUST tokio emerges as a suitable solution, especially given its third-party async library that allows for customization and modification. In terms of maintainability, RUST tokio outperforms ST.
+
+Compatibility: Both RUST and Go excel in terms of compatibility. ST in SRS requires manual assembly for coroutine switching, necessitating adaptation to hardware registers and function call mechanisms when encountering new hardware and CPU chips. Particularly, ST currently lacks compatibility with Windows SEH exception mechanisms. RUST and Go support a wide range of systems and CPUs without compatibility issues.
+
+## Next
+Streaming media falls within the realm of media technology, which can vary across different countries. Some countries may regulate media, limiting the availability of services in the media sector. However, the global user and developer base of SRS confirms its practical value. Our efforts can continue to have a positive impact on this planet, making it worthwhile to persist in our endeavors. It’s unfortunate that nginx-rtmp ceased maintenance, despite having a larger user base compared to SRS.
+
+The architecture of SRS primarily focuses on maintainability. Rather than adding more features, it is essential to listen to diverse voices from different countries and regions and explore various technological solutions and languages. I believe RUST could be an interesting pursuit. Even if it may not ultimately materialize, the process can yield many intriguing ideas, similar to how SRS incorporates many concepts from Go. Additionally, without in-person interactions, building an effective open-source community is challenging. It’s important to participate in open-source conferences and exchanges in different countries to foster such a community.
+
+Maintaining SRS involves encountering numerous fascinating technical challenges, engaging discussions, and diverse viewpoints. While these joys may not translate into monetary gains or serve as a means of livelihood, solely focusing on survival can be stifling. Conversely, even if these joys do not directly contribute to survival, they can still warm the heart and bring fulfillment.
diff --git a/trunk/3rdparty/st-srs/.gitignore b/trunk/3rdparty/st-srs/.gitignore
index 20f61304e..b9f037ced 100644
--- a/trunk/3rdparty/st-srs/.gitignore
+++ b/trunk/3rdparty/st-srs/.gitignore
@@ -12,3 +12,5 @@ googletest-*
 coverage
 codecov
 *.dSYM
+
+/cmake/build/
diff --git a/trunk/3rdparty/st-srs/ide/st_clion/CMakeLists.txt b/trunk/3rdparty/st-srs/cmake/CMakeLists.txt
similarity index 88%
rename from trunk/3rdparty/st-srs/ide/st_clion/CMakeLists.txt
rename to trunk/3rdparty/st-srs/cmake/CMakeLists.txt
index d54d8d3b4..a575a1e4d 100644
--- a/trunk/3rdparty/st-srs/ide/st_clion/CMakeLists.txt
+++ b/trunk/3rdparty/st-srs/cmake/CMakeLists.txt
@@ -1,3 +1,6 @@
+# CMake minimum version should be called first
+cmake_minimum_required(VERSION 3.10)
+
 # Name of the project.
 # Language "C" is required for find_package(Threads).
 if (CMAKE_VERSION VERSION_LESS 3.0)
@@ -6,14 +9,13 @@ else()
     cmake_policy(SET CMP0048 NEW)
     project(st VERSION 4.0.0 LANGUAGES CXX C ASM)
 endif()
-cmake_minimum_required(VERSION 2.8.12)
 
 # For utest required C++11.
 set (CMAKE_CXX_STANDARD 11)
 
 ###########################################################
 execute_process(
-        COMMAND bash -c "cd ${PROJECT_SOURCE_DIR}/../../ && pwd"
+        COMMAND bash -c "cd \"${PROJECT_SOURCE_DIR}/../\" && pwd"  # Quoted so paths with spaces resolve.
         OUTPUT_VARIABLE ST_DIR
 )
 string(STRIP ${ST_DIR} ST_DIR)
@@ -26,12 +28,12 @@ ProcessorCount(JOBS)
 
 # We should always configure ST for switching between branches.
 IF (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
-    ADD_DEFINITIONS("-arch x86_64 -DDARWIN -DMD_HAVE_KQUEUE -DMD_HAVE_SELECT -DDEBUG")
+    ADD_DEFINITIONS("-DDARWIN -DMD_HAVE_KQUEUE -DMD_HAVE_SELECT -DDEBUG")
 ELSE ()
     ADD_DEFINITIONS("-DLINUX -DMD_HAVE_EPOLL -DMD_HAVE_SELECT -DDEBUG")
 ENDIF ()
 
-EXEC_PROGRAM("cd ${ST_DIR} && mkdir -p obj && cp public.h obj/st.h")
+execute_process(COMMAND bash -c "mkdir -p obj && cp public.h obj/st.h" WORKING_DIRECTORY "${ST_DIR}")  # Publish public.h as obj/st.h.
 
 ###########################################################
 # For whole project.
@@ -51,11 +53,13 @@ ELSE ()
     list(APPEND SOURCE_FILES ${ST_DIR}/md_linux2.S)
 ENDIF ()
 
-ADD_DEFINITIONS("-g -O0")
+ADD_DEFINITIONS("-g -O0 -fsanitize=address -fno-omit-frame-pointer")
 
 ###########################################################
 # Setup ST utest project
-ADD_SUBDIRECTORY(${ST_DIR}/utest/gtest-fit gtest-fit)
+if(NOT TARGET gtest)
+    ADD_SUBDIRECTORY(${ST_DIR}/utest/gtest-fit gtest-fit)
+endif()
 INCLUDE_DIRECTORIES(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR})
 
 set(ST_UTEST_SOURCE_FILES ${SOURCE_FILES})
@@ -66,6 +70,7 @@ TARGET_LINK_LIBRARIES(st_utest gtest gtest_main)
 TARGET_LINK_LIBRARIES(st_utest dl)
 TARGET_LINK_LIBRARIES(st_utest ${DEPS_LIBS})
 TARGET_LINK_LIBRARIES(st_utest -ldl -pthread)
+TARGET_LINK_LIBRARIES(st_utest -fsanitize=address -fno-omit-frame-pointer)
 
 ###########################################################
 # Setup tools/backtrace project
diff --git a/trunk/3rdparty/st-srs/event.c b/trunk/3rdparty/st-srs/event.c
index c6e125b51..5965649ac 100644
--- a/trunk/3rdparty/st-srs/event.c
+++ b/trunk/3rdparty/st-srs/event.c
@@ -63,7 +63,7 @@ __thread unsigned long long _st_stat_epoll_spin = 0;
 
 
 #ifdef MD_HAVE_SELECT
-static struct _st_seldata {
+static __thread struct _st_seldata {
     fd_set fd_read_set, fd_write_set, fd_exception_set;
     int fd_ref_cnts[FD_SETSIZE][3];
     int maxfd;
@@ -432,7 +432,8 @@ ST_HIDDEN int _st_select_fd_getlimit(void)
 
 ST_HIDDEN void _st_select_destroy(void)
 {
-    /* TODO: FIXME: Implements it */
+    free(_st_select_data);
+    _st_select_data = NULL;
 }
 
 static _st_eventsys_t _st_select_eventsys = {
diff --git a/trunk/3rdparty/st-srs/key.c b/trunk/3rdparty/st-srs/key.c
index 86c042b8b..3fd602aa3 100644
--- a/trunk/3rdparty/st-srs/key.c
+++ b/trunk/3rdparty/st-srs/key.c
@@ -49,8 +49,8 @@
 /*
  * Destructor table for per-thread private data
  */
-static _st_destructor_t _st_destructors[ST_KEYS_MAX];
-static int key_max = 0;
+static __thread _st_destructor_t _st_destructors[ST_KEYS_MAX];
+static __thread int key_max = 0;
 
 
 /*
diff --git a/trunk/3rdparty/st-srs/md_darwin.S b/trunk/3rdparty/st-srs/md_darwin.S
index 7a3c2eee1..9ef05ce5e 100644
--- a/trunk/3rdparty/st-srs/md_darwin.S
+++ b/trunk/3rdparty/st-srs/md_darwin.S
@@ -117,8 +117,9 @@
     /* r29 and r30 are used as the frame register and link register (avoid) */
     #define JB_X29           10
     #define JB_LR            11
-    /* Register '31' is one of two registers depending on the instruction context:
-        For instructions dealing with the stack, it is the stack pointer, named rsp */
+    /* In AArch64, register encoding '31' is context-dependent (there is no "x31"):
+        - In stack-related instructions, it refers to SP (the stack pointer)
+        - In other instructions, it refers to XZR (the zero register) */
     #define JB_SP   13
 
     /* FP registers */
diff --git a/trunk/3rdparty/st-srs/md_linux2.S b/trunk/3rdparty/st-srs/md_linux2.S
index 0ee21ffa5..ff0294fa4 100644
--- a/trunk/3rdparty/st-srs/md_linux2.S
+++ b/trunk/3rdparty/st-srs/md_linux2.S
@@ -29,8 +29,9 @@
     /* r29 and r30 are used as the frame register and link register (avoid) */
     #define JB_X29           10
     #define JB_LR            11
-    /* Register '31' is one of two registers depending on the instruction context:
-        For instructions dealing with the stack, it is the stack pointer, named rsp */
+    /* In AArch64, register encoding '31' is context-dependent (there is no "x31"):
+        - In stack-related instructions, it refers to SP (the stack pointer)
+        - In other instructions, it refers to XZR (the zero register) */
     #define JB_SP		 13
 
     /* FP registers */
diff --git a/trunk/3rdparty/st-srs/utest/Makefile b/trunk/3rdparty/st-srs/utest/Makefile
index 2cd304b65..a95b691e2 100644
--- a/trunk/3rdparty/st-srs/utest/Makefile
+++ b/trunk/3rdparty/st-srs/utest/Makefile
@@ -15,6 +15,18 @@ WARNFLAGS += -Wall -Wno-deprecated-declarations -Wno-unused-private-field -Wno-u
 # House-keeping build targets.
 all : $(ST_DIR)/obj/st_utest
 
+# Auto-setup: symlink gtest-fit from SRS 3rdparty if not present, then fail with a hint if gtest.h is still missing.
+$(GTEST_DIR)/include/gtest/gtest.h:
+	@if [ ! -d "$(ST_UTEST)/gtest-fit" ] && [ -d "$(ST_UTEST)/../../gtest-fit" ]; then \
+		echo "Auto-link gtest-fit from $(ST_UTEST)/../../gtest-fit"; \
+		ln -sf ../../gtest-fit "$(ST_UTEST)/gtest-fit"; \
+	fi
+	@if [ ! -f "$@" ]; then \
+		echo "Error: gtest not found. Please download from https://github.com/google/googletest/releases/tag/release-1.11.0"; \
+		echo "  or symlink: ln -sf /path/to/gtest-fit $(ST_UTEST)/gtest-fit"; \
+		exit 1; \
+	fi
+
 clean :
 	rm -f $(ST_DIR)/obj/st_utest* $(ST_DIR)/obj/gtest*
 
@@ -26,7 +38,7 @@ GTEST_SRCS_ = $(GTEST_DIR)/src/*.cc $(GTEST_DIR)/src/*.h $(GTEST_DIR)/include/gt
 # implementation details, the dependencies specified below are
 # conservative and not optimized.  This is fine as Google Test
 # compiles fast and for ordinary users its source rarely changes.
-$(ST_DIR)/obj/gtest-all.o : $(GTEST_SRCS_)
+$(ST_DIR)/obj/gtest-all.o : $(GTEST_DIR)/include/gtest/gtest.h $(GTEST_SRCS_)
 	$(CXX) -c $(GTEST_DIR)/src/gtest-all.cc -o $@ \
         $(CXXFLAGS) $(UTEST_FLAGS) \
         $(WARNFLAGS) \
@@ -61,7 +73,7 @@ OBJECTS_FILES = $(patsubst %.cpp,%.o,$(SOURCE_FILES))
 OBJECTS = $(addprefix $(ST_DIR)/obj/,$(OBJECTS_FILES))
 
 # Objects, build each object of utest
-$(ST_DIR)/obj/%.o : %.cpp $(ST_UTEST_DEPS) $(UTEST_DEPS)
+$(ST_DIR)/obj/%.o : %.cpp $(ST_UTEST_DEPS) $(UTEST_DEPS) $(GTEST_DIR)/include/gtest/gtest.h
 	$(CXX) -c $< -o $@ \
         $(CXXFLAGS) $(UTEST_FLAGS) \
         $(WARNFLAGS) \
diff --git a/trunk/3rdparty/st-srs/utest/st_utest.hpp b/trunk/3rdparty/st-srs/utest/st_utest.hpp
index 7b6e5d4e2..24143fc59 100644
--- a/trunk/3rdparty/st-srs/utest/st_utest.hpp
+++ b/trunk/3rdparty/st-srs/utest/st_utest.hpp
@@ -12,6 +12,7 @@
 
 #include <st.h>
 #include <string>
+#include <memory>
 
 #define VOID
 
@@ -44,7 +45,7 @@ struct ErrorObject {
 };
 extern std::ostream& operator<<(std::ostream& out, const ErrorObject* err);
 #define ST_ASSERT_ERROR(error, r0, message) if (error) return new ErrorObject(r0, message)
-#define ST_COROUTINE_JOIN(trd, r0) ErrorObject* r0 = NULL; if (trd) st_thread_join(trd, (void**)&r0); SrsUniquePtr<ErrorObject> r0_uptr(r0)
+#define ST_COROUTINE_JOIN(trd, r0) ErrorObject* r0 = NULL; if (trd) st_thread_join(trd, (void**)&r0); std::unique_ptr<ErrorObject> r0##_uptr(r0)
 #define ST_EXPECT_SUCCESS(r0) EXPECT_TRUE(!r0) << r0
 #define ST_EXPECT_FAILED(r0) EXPECT_TRUE(r0) << r0
 
diff --git a/trunk/3rdparty/st-srs/utest/st_utest_learn_kb.cpp b/trunk/3rdparty/st-srs/utest/st_utest_learn_kb.cpp
new file mode 100644
index 000000000..53e5e035d
--- /dev/null
+++ b/trunk/3rdparty/st-srs/utest/st_utest_learn_kb.cpp
@@ -0,0 +1,802 @@
+/* SPDX-License-Identifier: MIT */
+/* Copyright (c) 2013-2026 The SRS Authors */
+
+#include <st_utest.hpp>
+
+#include <st.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#define ST_UTIME_MILLISECONDS 1000
+#define ST_UTEST_TIMEOUT (100 * ST_UTIME_MILLISECONDS)
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Unit tests for context switching: verify that _st_md_cxt_save/_st_md_cxt_restore
+// and st_thread_create's save-then-patch-SP trick actually work.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Test: a coroutine runs on a different stack than the primordial thread.
+// This proves the SP-patching trick in st_thread_create works.
+static void* coroutine_stack_addr(void* arg)
+{
+    int local_var = 42;
+    // Write our stack variable address back to the caller.
+    *(uintptr_t*)arg = (uintptr_t)&local_var;
+    return NULL;
+}
+
+VOID TEST(LearnKB, CoroutineRunsOnSeparateStack)
+{
+    uintptr_t primordial_stack_addr = 0;
+    uintptr_t coroutine_stack = 0;
+
+    // Record the address of a local that lives on this test body's own stack.
+    int local = 0;
+    primordial_stack_addr = (uintptr_t)&local;
+
+    st_thread_t trd = st_thread_create(coroutine_stack_addr, &coroutine_stack, 1, 0);
+    ASSERT_TRUE(trd != NULL);
+    st_thread_join(trd, NULL);
+
+    ASSERT_NE((uintptr_t)0, coroutine_stack);
+
+    // The coroutine's local must live far from the caller's local if the stacks differ.
+    // Only a conservative 4096-byte gap is asserted, not the full coroutine stack size.
+    uintptr_t diff = (primordial_stack_addr > coroutine_stack)
+        ? (primordial_stack_addr - coroutine_stack)
+        : (coroutine_stack - primordial_stack_addr);
+    EXPECT_GT(diff, (uintptr_t)4096) << "Coroutine stack should be on a separate heap-allocated stack";
+}
+
+// Test: context switches preserve local state across yields.
+// Each coroutine writes to its own local variable, yields, then checks the value.
+// This proves save/restore of registers and stack is correct.
+static void* coroutine_preserve_local(void* arg)
+{
+    int id = *(int*)arg;
+    int local_state = id * 1000;
+
+    // Yield to let other coroutines run.
+    st_usleep(0);
+
+    // After resume, local_state should be intact — it lives on our own stack.
+    local_state += 1;
+
+    // Write result back.
+    *(int*)arg = local_state;
+    return NULL;
+}
+
+VOID TEST(LearnKB, LocalStatePreservedAcrossYield)
+{
+    int a = 1, b = 2, c = 3;
+    st_thread_t t1 = st_thread_create(coroutine_preserve_local, &a, 1, 0);
+    st_thread_t t2 = st_thread_create(coroutine_preserve_local, &b, 1, 0);
+    st_thread_t t3 = st_thread_create(coroutine_preserve_local, &c, 1, 0);
+    ASSERT_TRUE(t1 && t2 && t3);
+
+    st_thread_join(t1, NULL);
+    st_thread_join(t2, NULL);
+    st_thread_join(t3, NULL);
+
+    // Each coroutine computed: id * 1000 + 1
+    EXPECT_EQ(1001, a);
+    EXPECT_EQ(2001, b);
+    EXPECT_EQ(3001, c);
+}
+
+// Test: cooperative yielding via st_usleep(0) round-trips through the scheduler correctly.
+// Each coroutine spins on st_usleep(0) until the shared counter reaches its own order value,
+// then increments the counter, so all three can only finish if every yield resumes properly.
+static int g_counter = 0;
+
+static void* coroutine_yield_order(void* arg)
+{
+    int my_order = *(int*)arg;
+
+    // Wait until it's our turn.
+    while (g_counter < my_order) {
+        st_usleep(0);
+    }
+
+    // It's our turn — increment.
+    g_counter++;
+    return NULL;
+}
+
+VOID TEST(LearnKB, YieldOrderPreserved)
+{
+    g_counter = 0;
+
+    int order0 = 0, order1 = 1, order2 = 2;
+    st_thread_t t0 = st_thread_create(coroutine_yield_order, &order0, 1, 0);
+    st_thread_t t1 = st_thread_create(coroutine_yield_order, &order1, 1, 0);
+    st_thread_t t2 = st_thread_create(coroutine_yield_order, &order2, 1, 0);
+    ASSERT_TRUE(t0 && t1 && t2);
+
+    st_thread_join(t0, NULL);
+    st_thread_join(t1, NULL);
+    st_thread_join(t2, NULL);
+
+    EXPECT_EQ(3, g_counter);
+}
+
+// Test: return value from coroutine is correctly passed through st_thread_join.
+// This proves the full lifecycle: create (save+patch SP) → schedule (restore) →
+// run → exit (save retval) → join (read retval).
+static void* coroutine_retval(void* arg)
+{
+    int input = *(int*)arg;
+    st_usleep(0);
+    // Return a computed value as void*.
+    return (void*)(intptr_t)(input * input);
+}
+
+VOID TEST(LearnKB, ReturnValueThroughJoin)
+{
+    int input = 7;
+    st_thread_t trd = st_thread_create(coroutine_retval, &input, 1, 0);
+    ASSERT_TRUE(trd != NULL);
+
+    void* retval = NULL;
+    st_thread_join(trd, &retval);
+
+    EXPECT_EQ(49, (int)(intptr_t)retval);
+}
+
+// Test: start routine must NOT execute inline in st_thread_create.
+// It should run only after scheduler handoff.
+static int g_create_started = 0;
+
+static void* coroutine_mark_started(void* /*arg*/)
+{
+    g_create_started++;
+    return NULL;
+}
+
+VOID TEST(LearnKB, StartRoutineNotExecutedInline)
+{
+    g_create_started = 0;
+
+    st_thread_t trd = st_thread_create(coroutine_mark_started, NULL, 1, 0);
+    ASSERT_TRUE(trd != NULL);
+
+    // Creator path: _st_md_cxt_save returns 0, so _st_thread_main is not run inline.
+    EXPECT_EQ(0, g_create_started) << "Coroutine must not run inline inside st_thread_create";
+
+    // After yielding, scheduler can run the created coroutine.
+    st_usleep(0);
+    EXPECT_EQ(1, g_create_started) << "Coroutine should run after scheduler handoff";
+
+    st_thread_join(trd, NULL);
+}
+
+static void* coroutine_run_once(void* arg)
+{
+    int* runs = (int*)arg;
+    (*runs)++;
+    return NULL;
+}
+
+VOID TEST(LearnKB, JoinDrivesFirstRunWhenNoManualYield)
+{
+    int runs = 0;
+
+    st_thread_t trd = st_thread_create(coroutine_run_once, &runs, 1, 0);
+    ASSERT_TRUE(trd != NULL);
+
+    // Still not run yet, because creator hasn't yielded.
+    EXPECT_EQ(0, runs);
+
+    // Join blocks current coroutine and hands control to scheduler.
+    st_thread_join(trd, NULL);
+    EXPECT_EQ(1, runs);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// The utest for st_mutex cooperative lock/wakeup workflow.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct MutexLearnCtx {
+    st_mutex_t lock;
+    int* order;
+    int* index;
+    int id;
+};
+
+static void* coroutine_mutex_fifo_waiter(void* arg)
+{
+    MutexLearnCtx* ctx = (MutexLearnCtx*)arg;
+
+    int r0 = st_mutex_lock(ctx->lock);
+    ST_ASSERT_ERROR(r0 != 0, r0, "waiter failed to lock mutex");
+
+    ctx->order[*ctx->index] = ctx->id;
+    (*ctx->index)++;
+
+    r0 = st_mutex_unlock(ctx->lock);
+    ST_ASSERT_ERROR(r0 != 0, r0, "waiter failed to unlock mutex");
+
+    return NULL;
+}
+
+VOID TEST(LearnKB, MutexCooperativeWorkflow)
+{
+    st_mutex_t lock = st_mutex_new();
+    ASSERT_TRUE(lock != NULL);
+
+    int r0 = st_mutex_lock(lock);
+    ASSERT_EQ(0, r0);
+
+    // Same-owner re-lock should fail with EDEADLK.
+    errno = 0;
+    r0 = st_mutex_lock(lock);
+    EXPECT_EQ(-1, r0);
+    EXPECT_EQ(EDEADLK, errno);
+
+    int order[2] = {0, 0};
+    int index = 0;
+
+    MutexLearnCtx w1 = {lock, order, &index, 1};
+    MutexLearnCtx w2 = {lock, order, &index, 2};
+
+    st_thread_t t1 = st_thread_create(coroutine_mutex_fifo_waiter, &w1, 1, 0);
+    st_thread_t t2 = st_thread_create(coroutine_mutex_fifo_waiter, &w2, 1, 0);
+    ASSERT_TRUE(t1 && t2);
+
+    // Let both waiters run and block on mutex wait_q in FIFO order.
+    st_usleep(0);
+
+    // Owner unlock should hand off to the first waiter without preemption.
+    r0 = st_mutex_unlock(lock);
+    ASSERT_EQ(0, r0);
+
+    // Yield to allow waiter1 then waiter2 to run.
+    st_usleep(0);
+    st_usleep(0);
+
+    ST_COROUTINE_JOIN(t1, t1_err);
+    ST_COROUTINE_JOIN(t2, t2_err);
+    ST_EXPECT_SUCCESS(t1_err);
+    ST_EXPECT_SUCCESS(t2_err);
+
+    EXPECT_EQ(2, index);
+    EXPECT_EQ(1, order[0]);
+    EXPECT_EQ(2, order[1]);
+
+    r0 = st_mutex_destroy(lock);
+    EXPECT_EQ(0, r0);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// The utest for epoll/kqueue driven I/O sleep/wakeup behavior.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct WakeCtx {
+    st_netfd_t reader;
+    st_netfd_t writer;
+    char value;
+};
+
+static void* delayed_writer(void* arg)
+{
+    WakeCtx* ctx = (WakeCtx*)arg;
+
+    // Give reader coroutine time to enter st_read() and block in st_poll().
+    st_usleep(10 * ST_UTIME_MILLISECONDS);
+
+    ssize_t n = st_write(ctx->writer, &ctx->value, 1, ST_UTEST_TIMEOUT);
+    ST_ASSERT_ERROR(n != 1, (int)n, "Writer failed");
+
+    return NULL;
+}
+
+static void* waiting_reader(void* arg)
+{
+    WakeCtx* ctx = (WakeCtx*)arg;
+
+    char ch = 0;
+    ssize_t n = st_read(ctx->reader, &ch, 1, ST_UTEST_TIMEOUT);
+    ST_ASSERT_ERROR(n != 1, (int)n, "Reader failed");
+    ST_ASSERT_ERROR(ch != ctx->value, (int)ch, "Unexpected byte");
+
+    return NULL;
+}
+
+VOID TEST(EpollWorkflowTest, ReaderSleepsAndWakesOnWriteReady)
+{
+    int fds[2] = {-1, -1};
+    int r0 = ::socketpair(AF_UNIX, SOCK_STREAM, 0, fds);
+    ASSERT_EQ(0, r0);  // Fatal: on failure fds stay -1 and nothing below is meaningful.
+    // Verify a reader waiting in st_read() is woken once the peer writes one byte.
+    st_netfd_t reader = st_netfd_open_socket(fds[0]);
+    st_netfd_t writer = st_netfd_open_socket(fds[1]);
+    ASSERT_TRUE(reader != NULL);
+    ASSERT_TRUE(writer != NULL);
+
+    WakeCtx ctx;
+    ctx.reader = reader;
+    ctx.writer = writer;
+    ctx.value = 'S';
+    // Reader is created first; delayed_writer sleeps before it writes ctx.value.
+    st_thread_t rd = st_thread_create(waiting_reader, &ctx, 1, 0);
+    st_thread_t wr = st_thread_create(delayed_writer, &ctx, 1, 0);
+
+    ASSERT_TRUE(rd != NULL);
+    ASSERT_TRUE(wr != NULL);
+    // Each coroutine returns NULL on success or an ErrorObject* describing the failure.
+    ST_COROUTINE_JOIN(rd, rd_err);
+    ST_COROUTINE_JOIN(wr, wr_err);
+
+    ST_EXPECT_SUCCESS(rd_err);
+    ST_EXPECT_SUCCESS(wr_err);
+
+    st_netfd_close(reader);
+    st_netfd_close(writer);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// The utest for st_cond condition variable workflow.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct CondLearnCtx {
+    st_cond_t cond;
+    int* wake_order;
+    int* wake_index;
+    int id;
+    st_utime_t timeout;
+};
+
+static void* coroutine_cond_wait_and_record(void* arg)
+{
+    CondLearnCtx* ctx = (CondLearnCtx*)arg;
+
+    int r0 = st_cond_timedwait(ctx->cond, ctx->timeout);
+    ST_ASSERT_ERROR(r0 != 0, r0, "cond wait expected success");
+
+    ctx->wake_order[*ctx->wake_index] = ctx->id;
+    (*ctx->wake_index)++;
+
+    return NULL;
+}
+
+static void* coroutine_cond_wait_timeout(void* arg)
+{
+    CondLearnCtx* ctx = (CondLearnCtx*)arg;
+
+    errno = 0;
+    int r0 = st_cond_timedwait(ctx->cond, ctx->timeout);
+    ST_ASSERT_ERROR(r0 != -1, r0, "cond wait should timeout");
+    ST_ASSERT_ERROR(errno != ETIME, errno, "cond wait should set ETIME");
+
+    return NULL;
+}
+
+VOID TEST(LearnKB, CondSignalWakeOne)
+{
+    st_cond_t cond = st_cond_new();
+    ASSERT_TRUE(cond != NULL);
+
+    int signal_order[2] = {0, 0};
+    int signal_index = 0;
+
+    CondLearnCtx s1 = {cond, signal_order, &signal_index, 1, ST_UTEST_TIMEOUT};
+    CondLearnCtx s2 = {cond, signal_order, &signal_index, 2, ST_UTEST_TIMEOUT};
+
+    st_thread_t ts1 = st_thread_create(coroutine_cond_wait_and_record, &s1, 1, 0);
+    st_thread_t ts2 = st_thread_create(coroutine_cond_wait_and_record, &s2, 1, 0);
+    ASSERT_TRUE(ts1 && ts2);
+
+    st_usleep(0);  // Let both waiters enter cond wait_q.
+
+    int r0 = st_cond_signal(cond);
+    ASSERT_EQ(0, r0);
+
+    st_usleep(0);  // Allow one waiter to resume.
+    EXPECT_EQ(1, signal_index);
+
+    // Wake remaining waiter to end the case cleanly.
+    r0 = st_cond_signal(cond);
+    ASSERT_EQ(0, r0);
+
+    ST_COROUTINE_JOIN(ts1, ts1_err);
+    ST_COROUTINE_JOIN(ts2, ts2_err);
+    ST_EXPECT_SUCCESS(ts1_err);
+    ST_EXPECT_SUCCESS(ts2_err);
+    EXPECT_EQ(2, signal_index);
+
+    r0 = st_cond_destroy(cond);
+    EXPECT_EQ(0, r0);
+}
+
+VOID TEST(LearnKB, CondBroadcastWakeAll)
+{
+    st_cond_t cond = st_cond_new();
+    ASSERT_TRUE(cond != NULL);
+
+    int broadcast_order[2] = {0, 0};
+    int broadcast_index = 0;
+
+    CondLearnCtx b1 = {cond, broadcast_order, &broadcast_index, 1, ST_UTEST_TIMEOUT};
+    CondLearnCtx b2 = {cond, broadcast_order, &broadcast_index, 2, ST_UTEST_TIMEOUT};
+
+    st_thread_t tb1 = st_thread_create(coroutine_cond_wait_and_record, &b1, 1, 0);
+    st_thread_t tb2 = st_thread_create(coroutine_cond_wait_and_record, &b2, 1, 0);
+    ASSERT_TRUE(tb1 && tb2);
+
+    st_usleep(0);  // Let both waiters enter cond wait_q.
+
+    int r0 = st_cond_broadcast(cond);
+    ASSERT_EQ(0, r0);
+
+    ST_COROUTINE_JOIN(tb1, tb1_err);
+    ST_COROUTINE_JOIN(tb2, tb2_err);
+    ST_EXPECT_SUCCESS(tb1_err);
+    ST_EXPECT_SUCCESS(tb2_err);
+    EXPECT_EQ(2, broadcast_index);
+
+    r0 = st_cond_destroy(cond);
+    EXPECT_EQ(0, r0);
+}
+
+VOID TEST(LearnKB, CondTimedwaitTimeout)
+{
+    st_cond_t cond = st_cond_new();
+    ASSERT_TRUE(cond != NULL);
+
+    CondLearnCtx t1 = {cond, NULL, NULL, 0, 10 * ST_UTIME_MILLISECONDS};
+    st_thread_t tt1 = st_thread_create(coroutine_cond_wait_timeout, &t1, 1, 0);
+    ASSERT_TRUE(tt1 != NULL);
+
+    ST_COROUTINE_JOIN(tt1, tt1_err);
+    ST_EXPECT_SUCCESS(tt1_err);
+
+    int r0 = st_cond_destroy(cond);
+    EXPECT_EQ(0, r0);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// The utest for st_thread_exit coroutine termination workflow.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static void* coroutine_explicit_exit_with_retval(void* /*arg*/)
+{
+    st_thread_exit((void*)(intptr_t)123);
+    return NULL;
+}
+
+VOID TEST(LearnKB, ThreadExitExplicitRetvalThroughJoin)
+{
+    st_thread_t trd = st_thread_create(coroutine_explicit_exit_with_retval, NULL, 1, 0);
+    ASSERT_TRUE(trd != NULL);
+
+    void* retval = NULL;
+    int r0 = st_thread_join(trd, &retval);
+    ASSERT_EQ(0, r0);
+    EXPECT_EQ(123, (int)(intptr_t)retval);
+}
+
+static void* coroutine_nonjoinable_exit(void* arg)
+{
+    int* finished = (int*)arg;
+    *finished = 1;
+    return NULL;
+}
+
+VOID TEST(LearnKB, ThreadExitNonJoinableCannotJoin)
+{
+    int finished = 0;
+
+    st_thread_t trd = st_thread_create(coroutine_nonjoinable_exit, &finished, 0, 0);
+    ASSERT_TRUE(trd != NULL);
+
+    errno = 0;
+    int r0 = st_thread_join(trd, NULL);
+    EXPECT_EQ(-1, r0);
+    EXPECT_EQ(EINVAL, errno);
+
+    // Non-joinable thread still runs and exits on its own.
+    st_usleep(0);
+    EXPECT_EQ(1, finished);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// The utest for st_thread_interrupt wakeup workflow.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct InterruptSleepCtx {
+    volatile int ready;
+    int r0;
+    int err;
+};
+
+static void* coroutine_interruptible_sleep(void* arg)
+{
+    InterruptSleepCtx* ctx = (InterruptSleepCtx*)arg;
+
+    ctx->ready = 1;
+    errno = 0;
+    ctx->r0 = st_usleep(ST_UTEST_TIMEOUT);
+    ctx->err = errno;
+
+    return NULL;
+}
+
+VOID TEST(LearnKB, ThreadInterruptWakeupFromUsleep)
+{
+    InterruptSleepCtx ctx = {0, 0, 0};
+
+    st_thread_t trd = st_thread_create(coroutine_interruptible_sleep, &ctx, 1, 0);
+    ASSERT_TRUE(trd != NULL);
+
+    while (!ctx.ready) {
+        st_usleep(0);
+    }
+    st_usleep(0);  // Let target enter st_usleep wait state.
+
+    st_thread_interrupt(trd);
+
+    ST_COROUTINE_JOIN(trd, trd_err);
+    ST_EXPECT_SUCCESS(trd_err);
+    EXPECT_EQ(-1, ctx.r0);
+    EXPECT_EQ(EINTR, ctx.err);
+}
+
+struct InterruptCondCtx {
+    st_cond_t cond;
+    volatile int ready;
+    int r0;
+    int err;
+};
+
+static void* coroutine_interruptible_condwait(void* arg)
+{
+    InterruptCondCtx* ctx = (InterruptCondCtx*)arg;
+
+    ctx->ready = 1;
+    errno = 0;
+    ctx->r0 = st_cond_timedwait(ctx->cond, ST_UTIME_NO_TIMEOUT);
+    ctx->err = errno;
+
+    return NULL;
+}
+
+VOID TEST(LearnKB, ThreadInterruptWakeupFromCondWait)
+{
+    st_cond_t cond = st_cond_new();
+    ASSERT_TRUE(cond != NULL);
+
+    InterruptCondCtx ctx = {cond, 0, 0, 0};
+    st_thread_t trd = st_thread_create(coroutine_interruptible_condwait, &ctx, 1, 0);
+    ASSERT_TRUE(trd != NULL);
+
+    while (!ctx.ready) {
+        st_usleep(0);
+    }
+    st_usleep(0);  // Let target enter cond wait_q.
+
+    st_thread_interrupt(trd);
+
+    ST_COROUTINE_JOIN(trd, trd_err);
+    ST_EXPECT_SUCCESS(trd_err);
+    EXPECT_EQ(-1, ctx.r0);
+    EXPECT_EQ(EINTR, ctx.err);
+
+    int r0 = st_cond_destroy(cond);
+    EXPECT_EQ(0, r0);
+}
+
+struct InterruptMutexCtx {
+    st_mutex_t lock;
+    volatile int ready;
+    int r0;
+    int err;
+};
+
+static void* coroutine_interruptible_mutex_lock(void* arg)
+{
+    InterruptMutexCtx* ctx = (InterruptMutexCtx*)arg;
+
+    ctx->ready = 1;
+    errno = 0;
+    ctx->r0 = st_mutex_lock(ctx->lock);
+    ctx->err = errno;
+
+    return NULL;
+}
+
+VOID TEST(LearnKB, ThreadInterruptWakeupFromMutexWait)
+{
+    st_mutex_t lock = st_mutex_new();
+    ASSERT_TRUE(lock != NULL);
+
+    int r0 = st_mutex_lock(lock);
+    ASSERT_EQ(0, r0);
+
+    InterruptMutexCtx ctx = {lock, 0, 0, 0};
+    st_thread_t trd = st_thread_create(coroutine_interruptible_mutex_lock, &ctx, 1, 0);
+    ASSERT_TRUE(trd != NULL);
+
+    while (!ctx.ready) {
+        st_usleep(0);
+    }
+    st_usleep(0);  // Let target block in mutex wait_q.
+
+    st_thread_interrupt(trd);
+
+    ST_COROUTINE_JOIN(trd, trd_err);
+    ST_EXPECT_SUCCESS(trd_err);
+    EXPECT_EQ(-1, ctx.r0);
+    EXPECT_EQ(EINTR, ctx.err);
+
+    r0 = st_mutex_unlock(lock);
+    EXPECT_EQ(0, r0);
+    r0 = st_mutex_destroy(lock);
+    EXPECT_EQ(0, r0);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// The utest for _st_netfd_t abstraction workflow.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+static int g_netfd_destructor_calls = 0;
+
+static void netfd_specific_destructor(void* arg)
+{
+    g_netfd_destructor_calls++;
+    free(arg);
+}
+
+VOID TEST(LearnKB, NetfdSpecificAndDestructorOnClose)
+{
+    int fds[2] = {-1, -1};
+    int r0 = ::socketpair(AF_UNIX, SOCK_STREAM, 0, fds);
+    ASSERT_EQ(0, r0);
+
+    st_netfd_t stfd = st_netfd_open_socket(fds[0]);
+    ASSERT_TRUE(stfd != NULL);
+
+    // st_netfd_open_socket takes ownership of fds[0], keep peer as raw fd.
+    int peer = fds[1];
+
+    g_netfd_destructor_calls = 0;
+    int* payload = (int*)malloc(sizeof(int));
+    ASSERT_TRUE(payload != NULL);
+    *payload = 2026;
+
+    st_netfd_setspecific(stfd, payload, netfd_specific_destructor);
+    void* got = st_netfd_getspecific(stfd);
+    EXPECT_EQ(payload, got);
+
+    r0 = st_netfd_close(stfd);
+    EXPECT_EQ(0, r0);
+    EXPECT_EQ(1, g_netfd_destructor_calls);
+
+    ::close(peer);
+}
+
+VOID TEST(LearnKB, NetfdFreeKeepsOsfdOpen)
+{
+    int fds[2] = {-1, -1};
+    int r0 = ::socketpair(AF_UNIX, SOCK_STREAM, 0, fds);
+    ASSERT_EQ(0, r0);
+
+    st_netfd_t stfd = st_netfd_open_socket(fds[0]);
+    ASSERT_TRUE(stfd != NULL);
+
+    int osfd = fds[0];
+    int peer = fds[1];
+
+    g_netfd_destructor_calls = 0;
+    int* payload = (int*)malloc(sizeof(int));
+    ASSERT_TRUE(payload != NULL);
+    *payload = 7;
+
+    st_netfd_setspecific(stfd, payload, netfd_specific_destructor);
+
+    // Free wrapper only: should trigger destructor but keep underlying osfd open.
+    st_netfd_free(stfd);
+    EXPECT_EQ(1, g_netfd_destructor_calls);
+
+    errno = 0;
+    int flags = fcntl(osfd, F_GETFD);
+    EXPECT_NE(-1, flags);
+
+    // Raw fd should still be usable.
+    char ch = 'N';
+    ssize_t n = ::write(peer, &ch, 1);
+    EXPECT_EQ(1, n);
+
+    char got = 0;
+    n = ::read(osfd, &got, 1);
+    EXPECT_EQ(1, n);
+    EXPECT_EQ('N', got);
+
+    ::close(osfd);
+    ::close(peer);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// The utest for basic ST netfd read/write workflow.
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+VOID TEST(LearnKB, BasicNetfdWriteThenRead)
+{
+    int fds[2] = {-1, -1};
+    int r0 = ::socketpair(AF_UNIX, SOCK_STREAM, 0, fds);
+    ASSERT_EQ(0, r0);
+
+    st_netfd_t reader = st_netfd_open_socket(fds[0]);
+    st_netfd_t writer = st_netfd_open_socket(fds[1]);
+    ASSERT_TRUE(reader != NULL);
+    ASSERT_TRUE(writer != NULL);
+
+    const char* msg = "hello-st";
+    ssize_t wn = st_write(writer, msg, 8, ST_UTEST_TIMEOUT);
+    ASSERT_EQ(8, wn);
+
+    char buf[16] = {0};
+    ssize_t rn = st_read(reader, buf, 8, ST_UTEST_TIMEOUT);
+    ASSERT_EQ(8, rn);
+    EXPECT_STREQ("hello-st", buf);
+
+    r0 = st_netfd_close(reader);
+    EXPECT_EQ(0, r0);
+    r0 = st_netfd_close(writer);
+    EXPECT_EQ(0, r0);
+}
+
+VOID TEST(LearnKB, BasicNetfdReadTimeout)
+{
+    int fds[2] = {-1, -1};
+    int r0 = ::socketpair(AF_UNIX, SOCK_STREAM, 0, fds);
+    ASSERT_EQ(0, r0);
+
+    st_netfd_t reader = st_netfd_open_socket(fds[0]);
+    st_netfd_t writer = st_netfd_open_socket(fds[1]);
+    ASSERT_TRUE(reader != NULL);
+    ASSERT_TRUE(writer != NULL);
+
+    char ch = 0;
+    errno = 0;
+    ssize_t rn = st_read(reader, &ch, 1, 10 * ST_UTIME_MILLISECONDS);
+    EXPECT_EQ(-1, rn);
+    EXPECT_EQ(ETIME, errno);
+
+    r0 = st_netfd_close(reader);
+    EXPECT_EQ(0, r0);
+    r0 = st_netfd_close(writer);
+    EXPECT_EQ(0, r0);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// The utest for _st_eventsys_t abstraction behavior (selected backend, immutable after st_init).
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+VOID TEST(LearnKB, EventSysSelectedAndLockedAfterInit)
+{
+    // st_utest main already called st_set_eventsys(...) then st_init().
+    // Here we verify what this abstraction guarantees at runtime.
+    int active = st_get_eventsys();
+    const char* name = st_get_eventsys_name();
+
+    EXPECT_TRUE(active == ST_EVENTSYS_SELECT || active == ST_EVENTSYS_ALT);
+    ASSERT_TRUE(name != NULL);  // Fatal: strlen(NULL) below would be undefined behavior.
+    EXPECT_GT((int)strlen(name), 0);
+
+    // Once selected, eventsys cannot be changed again.
+    errno = 0;
+    int r0 = st_set_eventsys(ST_EVENTSYS_SELECT);
+    EXPECT_EQ(-1, r0);
+    EXPECT_EQ(EBUSY, errno);
+
+    errno = 0;
+    r0 = st_set_eventsys(ST_EVENTSYS_ALT);
+    EXPECT_EQ(-1, r0);
+    EXPECT_EQ(EBUSY, errno);
+}
diff --git a/trunk/3rdparty/st-srs/utest/st_utest_tcp.cpp b/trunk/3rdparty/st-srs/utest/st_utest_tcp.cpp
index aef3a050c..a20487266 100644
--- a/trunk/3rdparty/st-srs/utest/st_utest_tcp.cpp
+++ b/trunk/3rdparty/st-srs/utest/st_utest_tcp.cpp
@@ -11,7 +11,8 @@
 #include <arpa/inet.h>
 
 #define ST_UTEST_PORT 26878
-#define ST_UTEST_TIMEOUT (100 * SRS_UTIME_MILLISECONDS)
+#define ST_UTIME_MILLISECONDS 1000
+#define ST_UTEST_TIMEOUT (100 * ST_UTIME_MILLISECONDS)
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // The utest for ping-pong TCP server coroutine.
diff --git a/trunk/doc/CHANGELOG.md b/trunk/doc/CHANGELOG.md
index acc89b854..236e74f4c 100644
--- a/trunk/doc/CHANGELOG.md
+++ b/trunk/doc/CHANGELOG.md
@@ -7,6 +7,7 @@ The changelog for SRS.
 <a name="v7-changes"></a>
 
 ## SRS 7.0 Changelog
+* v7.0, 2026-03-05, Merge [#4643](https://github.com/ossrs/srs/pull/4643): OpenClaw: add and refine ST knowledge-base and learning/review skills. v7.0.138 (#4643)
 * v7.0, 2025-12-31, Merge [#4618](https://github.com/ossrs/srs/pull/4618): HLS/DASH: Fix dispose() to cleanup files even after on_unpublish() sets enabled_ to false. v7.0.137 (#4618)
 * v7.0, 2025-12-07, Merge [#4602](https://github.com/ossrs/srs/pull/4602): HLS: Fix audio-only fMP4 playback skipping. v7.0.136 (#4602)
 * v7.0, 2025-12-06, Merge [#4604](https://github.com/ossrs/srs/pull/4604): DVR: Fix HEVC mp4 recording error. v7.0.135 (#4604)