From 30ea67f5f2e7d0d967e2f27dbe51d2a628db7251 Mon Sep 17 00:00:00 2001 From: Jacob Su Date: Tue, 12 Aug 2025 21:50:08 +0800 Subject: [PATCH] MP4 DVR: Fix audio/video synchronization issues in WebRTC recordings. v6.0.172 v7.0.52 (#4230) Fixes #3993 - WebRTC streams recorded to MP4 via DVR exhibit audio/video synchronization issues, with audio typically ahead of video. **Note: This issue is specific to MP4 format; FLV recordings are not affected.** When WebRTC streams are converted to RTMP and then muxed to MP4, the audio and video tracks may start at different timestamps. The MP4 muxer was not accounting for this timing offset between the first audio and video samples in the STTS (Sample Time-to-Sample) table, causing the tracks to be misaligned in the final MP4 file. Introduces `SrsMp4DvrJitter` class specifically for MP4 audio/video synchronization: - **Timestamp Tracking**: Records the DTS of the first audio and video samples - **Offset Calculation**: Computes the timing difference between track start times - **MP4 STTS Correction**: Sets appropriate `sample_delta` values in the MP4 STTS table to maintain proper A/V sync - Added `SrsMp4DvrJitter` class in `srs_kernel_mp4.hpp/cpp` - Integrated jitter correction into `SrsMp4SampleManager::write_track()` for MP4 format only - Added comprehensive unit tests covering various timing scenarios - **Scope**: Changes are isolated to MP4 kernel code and do not affect FLV processing This fix ensures that MP4 DVR recordings from WebRTC streams maintain proper audio/video synchronization regardless of the relative timing of the first audio and video frames, while leaving FLV format processing unchanged. 
--------- Co-authored-by: Haibo Chen <495810242@qq.com> Co-authored-by: john Co-authored-by: winlin Co-authored-by: OSSRS-AI --- trunk/doc/CHANGELOG.md | 2 + trunk/src/core/srs_core_version6.hpp | 2 +- trunk/src/core/srs_core_version7.hpp | 2 +- trunk/src/kernel/srs_kernel_mp4.cpp | 50 ++++++++++ trunk/src/kernel/srs_kernel_mp4.hpp | 31 ++++++ trunk/src/utest/srs_utest_mp4.cpp | 141 +++++++++++++++++++++++++++ 6 files changed, 226 insertions(+), 2 deletions(-) diff --git a/trunk/doc/CHANGELOG.md b/trunk/doc/CHANGELOG.md index 36e815e91..97de3a803 100644 --- a/trunk/doc/CHANGELOG.md +++ b/trunk/doc/CHANGELOG.md @@ -7,6 +7,7 @@ The changelog for SRS. ## SRS 7.0 Changelog +* v7.0, 2025-08-12, Merge [#4230](https://github.com/ossrs/srs/pull/4230): MP4 DVR: Fix audio/video synchronization issues in WebRTC recordings. v7.0.52 (#4230) * v7.0, 2025-08-12, Merge [#4301](https://github.com/ossrs/srs/pull/4301): Valgrind: Return error for unsupported check=new on Valgrind < 3.21. v7.0.52 (#4301) * v7.0, 2025-08-12, Merge [#4431](https://github.com/ossrs/srs/pull/4431): fix srt cmake 4.x compiling error. v7.0.52 (#4431) * v7.0, 2025-08-11, Merge [#4433](https://github.com/ossrs/srs/pull/4433): Use clang format. v7.0.52 (#4433) @@ -65,6 +66,7 @@ The changelog for SRS. ## SRS 6.0 Changelog +* v6.0, 2025-08-12, Merge [#4230](https://github.com/ossrs/srs/pull/4230): MP4 DVR: Fix audio/video synchronization issues in WebRTC recordings. v6.0.172 (#4230) * v6.0, 2025-08-11, Merge [#4432](https://github.com/ossrs/srs/pull/4432): AI: HTTP-FLV: Fix heap-use-after-free crash during stream unmount. v6.0.171 (#4432) * v6.0, 2025-07-28, Merge [#4245](https://github.com/ossrs/srs/pull/4245): Allow Forward to be configured with Env Var. v6.0.170 (#4245) * v6.0, 2025-07-10, Merge [#4414](https://github.com/ossrs/srs/pull/4414): Fix H.264 B-frame detection logic to comply with specification. 
v6.0.169 (#4414)
diff --git a/trunk/src/core/srs_core_version6.hpp b/trunk/src/core/srs_core_version6.hpp
index af5bdf5b1..89a09a264 100644
--- a/trunk/src/core/srs_core_version6.hpp
+++ b/trunk/src/core/srs_core_version6.hpp
@@ -9,6 +9,6 @@
 
 #define VERSION_MAJOR 6
 #define VERSION_MINOR 0
-#define VERSION_REVISION 171
+#define VERSION_REVISION 172
 
 #endif
diff --git a/trunk/src/core/srs_core_version7.hpp b/trunk/src/core/srs_core_version7.hpp
index 60b16270f..21fd136c6 100644
--- a/trunk/src/core/srs_core_version7.hpp
+++ b/trunk/src/core/srs_core_version7.hpp
@@ -9,6 +9,6 @@
 
 #define VERSION_MAJOR 7
 #define VERSION_MINOR 0
-#define VERSION_REVISION 51
+#define VERSION_REVISION 52
 
 #endif
\ No newline at end of file
diff --git a/trunk/src/kernel/srs_kernel_mp4.cpp b/trunk/src/kernel/srs_kernel_mp4.cpp
index 10b648b7b..d52b22fcf 100644
--- a/trunk/src/kernel/srs_kernel_mp4.cpp
+++ b/trunk/src/kernel/srs_kernel_mp4.cpp
@@ -5549,12 +5549,73 @@ uint32_t SrsMp4Sample::pts_ms()
     return (uint32_t)(pts * 1000 / tbn) + adjust;
 }
 
+SrsMp4DvrJitter::SrsMp4DvrJitter()
+{
+    reset();
+}
+
+SrsMp4DvrJitter::~SrsMp4DvrJitter()
+{
+}
+
+void SrsMp4DvrJitter::on_sample(SrsMp4Sample *sample)
+{
+    // Only the first sample of each track is recorded, later ones are ignored.
+    if (!has_first_audio_ && sample->type == SrsFrameTypeAudio) {
+        has_first_audio_ = true;
+        audio_start_dts_ = sample->dts;
+    }
+
+    if (!has_first_video_ && sample->type == SrsFrameTypeVideo) {
+        has_first_video_ = true;
+        video_start_dts_ = sample->dts;
+    }
+}
+
+// The delta is how far the first DTS of this track lags behind the first DTS
+// of the other track. It's 0 for the track that starts first, for unknown
+// track types, and for audio-only or video-only recordings.
+uint32_t SrsMp4DvrJitter::get_first_sample_delta(SrsFrameType track)
+{
+    // If one track has no sample yet, its start DTS is still the reset value 0,
+    // so the difference would be an absolute DTS instead of an A/V offset.
+    if (!is_initialized()) {
+        return 0;
+    }
+
+    uint64_t delta = 0;
+    if (track == SrsFrameTypeVideo && video_start_dts_ > audio_start_dts_) {
+        delta = video_start_dts_ - audio_start_dts_;
+    } else if (track == SrsFrameTypeAudio && audio_start_dts_ > video_start_dts_) {
+        delta = audio_start_dts_ - video_start_dts_;
+    }
+
+    // Saturate rather than wrap when narrowing the 64-bit difference to 32 bits.
+    return delta > 0xffffffffULL ? 0xffffffffU : (uint32_t)delta;
+}
+
+void SrsMp4DvrJitter::reset()
+{
+    video_start_dts_ = 0;
+    audio_start_dts_ = 0;
+    has_first_video_ = false;
+    has_first_audio_ = false;
+}
+
+bool SrsMp4DvrJitter::is_initialized()
+{
+    return has_first_video_ && has_first_audio_;
+}
+
 SrsMp4SampleManager::SrsMp4SampleManager()
 {
+    jitter_ = new SrsMp4DvrJitter();
 }
 
 SrsMp4SampleManager::~SrsMp4SampleManager()
 {
+    srs_freep(jitter_);
+
     vector<SrsMp4Sample *>::iterator it;
     for (it = samples.begin(); it != samples.end(); ++it) {
         SrsMp4Sample *sample = *it;
@@ -5631,6 +5692,7 @@ SrsMp4Sample *SrsMp4SampleManager::at(uint32_t index)
 
 void SrsMp4SampleManager::append(SrsMp4Sample *sample)
 {
+    jitter_->on_sample(sample);
     samples.push_back(sample);
 }
 
@@ -5805,6 +5867,7 @@ srs_error_t SrsMp4SampleManager::write_track(SrsFrameType track,
             } else {
                 // The first sample always in the STTS table.
                 stts_entry.sample_count++;
+                stts_entry.sample_delta = jitter_->get_first_sample_delta(track);
             }
         }
 
diff --git a/trunk/src/kernel/srs_kernel_mp4.hpp b/trunk/src/kernel/srs_kernel_mp4.hpp
index 8df73ef55..b1c903ae9 100644
--- a/trunk/src/kernel/srs_kernel_mp4.hpp
+++ b/trunk/src/kernel/srs_kernel_mp4.hpp
@@ -2434,6 +2434,34 @@ public:
     virtual uint32_t pts_ms();
 };
 
+// MP4 DVR jitter for audio/video synchronization in DVR recordings.
+// Handles timing offset between audio and video tracks to ensure proper A/V sync in MP4 files.
+class SrsMp4DvrJitter
+{
+private:
+    uint64_t video_start_dts_;
+    uint64_t audio_start_dts_;
+    bool has_first_video_;
+    bool has_first_audio_;
+
+public:
+    SrsMp4DvrJitter();
+    virtual ~SrsMp4DvrJitter();
+
+public:
+    // Record the DTS of the first audio and the first video sample, later samples of that track are ignored.
+    virtual void on_sample(SrsMp4Sample *sample);
+    // Get the STTS delta for the first sample of the track, which is how far its first DTS
+    // is behind the first DTS of the other track, or 0 if this track does not start later.
+    virtual uint32_t get_first_sample_delta(SrsFrameType track);
+
+public:
+    // Clear both start DTS and both flags, so the next audio and video samples are taken as the first ones.
+    virtual void reset();
+    // Check if both audio and video start times have been captured
+    virtual bool is_initialized();
+};
+
 // Build samples from moov, or write samples to moov.
 // One or more sample are grouped to a chunk, each track contains one or more chunks.
 // The offset of chunk is specified by stco.
@@ -2445,6 +2473,9 @@ public:
 // The keyframe is specified by stss.
 class SrsMp4SampleManager
 {
+private:
+    SrsMp4DvrJitter *jitter_; // MP4 A/V sync jitter handler
+
 public:
     std::vector<SrsMp4Sample *> samples;
 
diff --git a/trunk/src/utest/srs_utest_mp4.cpp b/trunk/src/utest/srs_utest_mp4.cpp
index 76cf7d18f..e167c696f 100644
--- a/trunk/src/utest/srs_utest_mp4.cpp
+++ b/trunk/src/utest/srs_utest_mp4.cpp
@@ -2454,3 +2454,144 @@ VOID TEST(KernelMp4Test, SrsMp4M2tsInitEncoder)
         EXPECT_TRUE(fw.filesize() > 0);
     }
 }
+
+VOID TEST(KernelMp4Test, SrsMp4DvrJitter)
+{
+    // Test basic initialization
+    if (true) {
+        SrsMp4DvrJitter jitter;
+
+        // Should not be initialized yet
+        EXPECT_FALSE(jitter.is_initialized());
+
+        // Delta should be 0 for uninitialized jitter
+        EXPECT_EQ(0, jitter.get_first_sample_delta(SrsFrameTypeVideo));
+        EXPECT_EQ(0, jitter.get_first_sample_delta(SrsFrameTypeAudio));
+    }
+
+    // Test audio first scenario
+    if (true) {
+        SrsMp4DvrJitter jitter;
+
+        // Create audio sample that arrives first
+        SrsMp4Sample audio_sample;
+        audio_sample.type = SrsFrameTypeAudio;
+        audio_sample.dts = 1000; // First audio DTS
+
+        // Create video sample that arrives later
+        SrsMp4Sample video_sample;
+        video_sample.type = SrsFrameTypeVideo;
+        video_sample.dts = 2000; // First video DTS, later than audio
+
+        // Process samples
+        jitter.on_sample(&audio_sample);
+        jitter.on_sample(&video_sample);
+
+        // Should be initialized now
+        EXPECT_TRUE(jitter.is_initialized());
+
+        // Video should have delta = video_start - audio_start = 2000 - 1000 = 1000
+        EXPECT_EQ(1000, jitter.get_first_sample_delta(SrsFrameTypeVideo));
+
+        // Audio should have delta = 0 (since audio started first)
+        EXPECT_EQ(0, jitter.get_first_sample_delta(SrsFrameTypeAudio));
+    }
+
+    // Test video first scenario
+    if (true) {
+        SrsMp4DvrJitter jitter;
+
+        // Create video sample that arrives first
+        SrsMp4Sample video_sample;
+        video_sample.type = SrsFrameTypeVideo;
+        video_sample.dts = 500; // First video DTS
+
+        // Create audio sample that arrives later
+        SrsMp4Sample audio_sample;
+        audio_sample.type = SrsFrameTypeAudio;
+        audio_sample.dts = 1500; // First audio DTS, later than video
+
+        // Process samples
+        jitter.on_sample(&video_sample);
+        jitter.on_sample(&audio_sample);
+
+        // Should be initialized now
+        EXPECT_TRUE(jitter.is_initialized());
+
+        // Audio should have delta = audio_start - video_start = 1500 - 500 = 1000
+        EXPECT_EQ(1000, jitter.get_first_sample_delta(SrsFrameTypeAudio));
+
+        // Video should have delta = 0 (since video started first)
+        EXPECT_EQ(0, jitter.get_first_sample_delta(SrsFrameTypeVideo));
+    }
+
+    // Test same start time scenario
+    if (true) {
+        SrsMp4DvrJitter jitter;
+
+        // Create samples with same start time
+        SrsMp4Sample audio_sample;
+        audio_sample.type = SrsFrameTypeAudio;
+        audio_sample.dts = 1000;
+
+        SrsMp4Sample video_sample;
+        video_sample.type = SrsFrameTypeVideo;
+        video_sample.dts = 1000;
+
+        // Process samples
+        jitter.on_sample(&audio_sample);
+        jitter.on_sample(&video_sample);
+
+        // Should be initialized now
+        EXPECT_TRUE(jitter.is_initialized());
+
+        // Both should have delta = 0 (same start time)
+        EXPECT_EQ(0, jitter.get_first_sample_delta(SrsFrameTypeVideo));
+        EXPECT_EQ(0, jitter.get_first_sample_delta(SrsFrameTypeAudio));
+    }
+
+    // Test reset functionality
+    if (true) {
+        SrsMp4DvrJitter jitter;
+
+        // Initialize with samples
+        SrsMp4Sample audio_sample;
+        audio_sample.type = SrsFrameTypeAudio;
+        audio_sample.dts = 1000;
+
+        jitter.on_sample(&audio_sample);
+
+        // Reset and verify
+        jitter.reset();
+        EXPECT_FALSE(jitter.is_initialized());
+        EXPECT_EQ(0, jitter.get_first_sample_delta(SrsFrameTypeVideo));
+        EXPECT_EQ(0, jitter.get_first_sample_delta(SrsFrameTypeAudio));
+    }
+
+    // Test multiple samples of same type (should only record first)
+    if (true) {
+        SrsMp4DvrJitter jitter;
+
+        // Create multiple audio samples
+        SrsMp4Sample audio1;
+        audio1.type = SrsFrameTypeAudio;
+        audio1.dts = 1000;
+
+        SrsMp4Sample audio2;
+        audio2.type = SrsFrameTypeAudio;
+        audio2.dts = 2000; // This should be ignored
+
+        SrsMp4Sample video1;
+        video1.type = SrsFrameTypeVideo;
+        video1.dts = 1500;
+
+        // Process samples
+        jitter.on_sample(&audio1);
+        jitter.on_sample(&audio2); // Should be ignored
+        jitter.on_sample(&video1);
+
+        // Should use first audio sample (1000) not second (2000)
+        EXPECT_EQ(500, jitter.get_first_sample_delta(SrsFrameTypeVideo)); // 1500 - 1000 = 500
+        EXPECT_EQ(0, jitter.get_first_sample_delta(SrsFrameTypeAudio));
+    }
+}