MP4 DVR: Fix audio/video synchronization issues in WebRTC recordings. v6.0.172 v7.0.52 (#4230)

Fixes #3993 - WebRTC streams recorded to MP4 via DVR exhibit audio/video
synchronization issues, with audio typically ahead of video. **Note:
This issue is specific to MP4 format; FLV recordings are not affected.**

When WebRTC streams are converted to RTMP and then muxed to MP4, the
audio and video tracks may start at different timestamps. The MP4 muxer
was not accounting for this timing offset between the first audio and
video samples in the STTS (Decoding Time-to-Sample) table, causing the
tracks to be misaligned in the final MP4 file.

Introduces `SrsMp4DvrJitter` class specifically for MP4 audio/video
synchronization:

- **Timestamp Tracking**: Records the DTS of the first audio and video
samples
- **Offset Calculation**: Computes the timing difference between track
start times
- **MP4 STTS Correction**: Sets appropriate `sample_delta` values in the
MP4 STTS table to maintain proper A/V sync

- Added `SrsMp4DvrJitter` class in `srs_kernel_mp4.hpp/cpp`
- Integrated jitter correction into `SrsMp4SampleManager::write_track()`
for MP4 format only
- Added comprehensive unit tests covering various timing scenarios
- **Scope**: Changes are isolated to MP4 kernel code and do not affect
FLV processing

This fix ensures that MP4 DVR recordings from WebRTC streams maintain
proper audio/video synchronization regardless of the relative timing of
the first audio and video frames, while leaving FLV format processing
unchanged.

---------

Co-authored-by: Haibo Chen <495810242@qq.com>
Co-authored-by: john <hondaxiao@tencent.com>
Co-authored-by: winlin <winlinvip@gmail.com>
Co-authored-by: OSSRS-AI <winlinam@gmail.com>
This commit is contained in:
Jacob Su 2025-08-12 21:50:08 +08:00 committed by winlin
parent db5e43967c
commit 30ea67f5f2
6 changed files with 226 additions and 2 deletions

View File

@ -7,6 +7,7 @@ The changelog for SRS.
<a name="v7-changes"></a>
## SRS 7.0 Changelog
* v7.0, 2025-08-12, Merge [#4230](https://github.com/ossrs/srs/pull/4230): MP4 DVR: Fix audio/video synchronization issues in WebRTC recordings. v7.0.52 (#4230)
* v7.0, 2025-08-12, Merge [#4301](https://github.com/ossrs/srs/pull/4301): Valgrind: Return error for unsupported check=new on Valgrind < 3.21. v7.0.52 (#4301)
* v7.0, 2025-08-12, Merge [#4431](https://github.com/ossrs/srs/pull/4431): fix srt cmake 4.x compiling error. v7.0.52 (#4431)
* v7.0, 2025-08-11, Merge [#4433](https://github.com/ossrs/srs/pull/4433): Use clang format. v7.0.52 (#4433)
@ -65,6 +66,7 @@ The changelog for SRS.
<a name="v6-changes"></a>
## SRS 6.0 Changelog
* v6.0, 2025-08-12, Merge [#4230](https://github.com/ossrs/srs/pull/4230): MP4 DVR: Fix audio/video synchronization issues in WebRTC recordings. v6.0.172 (#4230)
* v6.0, 2025-08-11, Merge [#4432](https://github.com/ossrs/srs/pull/4432): AI: HTTP-FLV: Fix heap-use-after-free crash during stream unmount. v6.0.171 (#4432)
* v6.0, 2025-07-28, Merge [#4245](https://github.com/ossrs/srs/pull/4245): Allow Forward to be configured with Env Var. v6.0.170 (#4245)
* v6.0, 2025-07-10, Merge [#4414](https://github.com/ossrs/srs/pull/4414): Fix H.264 B-frame detection logic to comply with specification. v6.0.169 (#4414)

View File

@ -9,6 +9,6 @@
#define VERSION_MAJOR 6
#define VERSION_MINOR 0
#define VERSION_REVISION 171
#define VERSION_REVISION 172
#endif

View File

@ -9,6 +9,6 @@
#define VERSION_MAJOR 7
#define VERSION_MINOR 0
#define VERSION_REVISION 51
#define VERSION_REVISION 52
#endif

View File

@ -5549,12 +5549,60 @@ uint32_t SrsMp4Sample::pts_ms()
return (uint32_t)(pts * 1000 / tbn) + adjust;
}
// Construct a jitter that has not recorded a first sample for either track.
// The members are set directly to the same values reset() assigns, so no
// virtual function is invoked while the object is still being constructed.
SrsMp4DvrJitter::SrsMp4DvrJitter()
    : video_start_dts_(0), audio_start_dts_(0), has_first_video_(false), has_first_audio_(false)
{
}
// Nothing to release: the jitter only holds plain timestamp and flag members.
SrsMp4DvrJitter::~SrsMp4DvrJitter()
{
}
// Observe a sample and remember the DTS of the first audio and the first video
// sample that are seen. Later samples of a track that already has a recorded
// start never overwrite it.
//
// @param sample The sample being appended; its type and dts are read.
void SrsMp4DvrJitter::on_sample(SrsMp4Sample *sample)
{
    // Both track starts are already known, nothing left to record.
    if (has_first_audio_ && has_first_video_) {
        return;
    }

    if (sample->type == SrsFrameTypeAudio) {
        if (!has_first_audio_) {
            audio_start_dts_ = sample->dts;
            has_first_audio_ = true;
        }
    } else if (sample->type == SrsFrameTypeVideo) {
        if (!has_first_video_) {
            video_start_dts_ = sample->dts;
            has_first_video_ = true;
        }
    }
}
// Compute the delta to use for the first sample of the given track, so that
// the track which started later is shifted by the gap between the first DTS
// of the two tracks.
//
// @param track The track being written, SrsFrameTypeVideo or SrsFrameTypeAudio.
// @return The gap between the two start DTS values (in the same units as
//      SrsMp4Sample::dts) when the requested track is the one that started
//      later. Returns 0 when the requested track started first, when both
//      tracks started at the same DTS, when track is neither audio nor video,
//      or when the first sample of either track has not been recorded yet.
// @remark The start DTS values are 64-bit while the return type is 32-bit; the
//      difference is narrowed to uint32_t.
uint32_t SrsMp4DvrJitter::get_first_sample_delta(SrsFrameType track)
{
    // An offset only makes sense between two known starts. If one track never
    // produced a sample, its start DTS is still the reset value 0 and the
    // subtraction below would yield the other track's absolute first DTS
    // instead of a relative gap (e.g. for audio-only recordings).
    if (!has_first_video_ || !has_first_audio_) {
        return 0;
    }

    if (track == SrsFrameTypeVideo && video_start_dts_ > audio_start_dts_) {
        return (uint32_t)(video_start_dts_ - audio_start_dts_);
    }

    if (track == SrsFrameTypeAudio && audio_start_dts_ > video_start_dts_) {
        return (uint32_t)(audio_start_dts_ - video_start_dts_);
    }

    return 0;
}
// Forget any recorded track starts: both first-sample flags are cleared and
// both start DTS values go back to 0.
void SrsMp4DvrJitter::reset()
{
    has_first_audio_ = has_first_video_ = false;
    audio_start_dts_ = video_start_dts_ = 0;
}
// Tell whether the first sample of BOTH the audio and the video track has
// been recorded.
bool SrsMp4DvrJitter::is_initialized()
{
    if (!has_first_video_) {
        return false;
    }
    return has_first_audio_;
}
// Create the manager together with the jitter it owns; the jitter is freed
// in the destructor.
SrsMp4SampleManager::SrsMp4SampleManager()
    : jitter_(new SrsMp4DvrJitter())
{
}
SrsMp4SampleManager::~SrsMp4SampleManager()
{
srs_freep(jitter_);
vector<SrsMp4Sample *>::iterator it;
for (it = samples.begin(); it != samples.end(); ++it) {
SrsMp4Sample *sample = *it;
@ -5631,6 +5679,7 @@ SrsMp4Sample *SrsMp4SampleManager::at(uint32_t index)
// Add a sample to the end of the sample list.
//
// @param sample The sample to store; the pointer itself is kept in samples.
void SrsMp4SampleManager::append(SrsMp4Sample *sample)
{
    // Show the sample to the jitter first, so it can capture the first DTS
    // of each track before the sample is stored.
    jitter_->on_sample(sample);

    samples.insert(samples.end(), sample);
}
@ -5805,6 +5854,7 @@ srs_error_t SrsMp4SampleManager::write_track(SrsFrameType track,
} else {
// The first sample always in the STTS table.
stts_entry.sample_count++;
stts_entry.sample_delta = jitter_->get_first_sample_delta(track);
}
}

View File

@ -2434,6 +2434,34 @@ public:
virtual uint32_t pts_ms();
};
// A/V start-offset tracker for MP4 DVR recordings.
// It remembers the DTS of the first audio and first video sample, and reports the gap
// between them so the MP4 muxer can keep the two tracks in sync.
class SrsMp4DvrJitter
{
private:
    // DTS of the first video and audio sample seen; 0 until one is recorded.
    uint64_t video_start_dts_;
    uint64_t audio_start_dts_;
    // Whether the first sample of each track has been recorded.
    bool has_first_video_;
    bool has_first_audio_;

public:
    SrsMp4DvrJitter();
    virtual ~SrsMp4DvrJitter();
    // Feed a sample; only the first audio and the first video sample are remembered.
    virtual void on_sample(SrsMp4Sample *sample);
    // Get the delta for the first sample of the given track: the gap between the
    // two track starts when this track started later than the other, otherwise 0.
    virtual uint32_t get_first_sample_delta(SrsFrameType track);

private:
    // Clear the recorded start timestamps and flags.
    // NOTE(review): the unit test calls reset() and is_initialized() directly although
    // they are private; presumably the test build exposes private members - confirm.
    virtual void reset();
    // Whether the start of both the audio and the video track has been recorded.
    virtual bool is_initialized();
};
// Build samples from moov, or write samples to moov.
// One or more sample are grouped to a chunk, each track contains one or more chunks.
// The offset of chunk is specified by stco.
@ -2445,6 +2473,9 @@ public:
// The keyframe is specified by stss.
class SrsMp4SampleManager
{
private:
SrsMp4DvrJitter *jitter_; // MP4 A/V sync jitter handler
public:
std::vector<SrsMp4Sample *> samples;

View File

@ -2454,3 +2454,144 @@ VOID TEST(KernelMp4Test, SrsMp4M2tsInitEncoder)
EXPECT_TRUE(fw.filesize() > 0);
}
}
// Covers SrsMp4DvrJitter: the initial state, either track starting first, equal
// starts, reset, and repeated samples of one track.
VOID TEST(KernelMp4Test, SrsMp4DvrJitter)
{
    // A fresh jitter has seen nothing, so it reports no offset for either track.
    if (true) {
        SrsMp4DvrJitter j;

        EXPECT_FALSE(j.is_initialized());

        EXPECT_EQ(0, j.get_first_sample_delta(SrsFrameTypeVideo));
        EXPECT_EQ(0, j.get_first_sample_delta(SrsFrameTypeAudio));
    }

    // Audio starts before video: only the video track gets shifted.
    if (true) {
        SrsMp4DvrJitter j;

        // The audio sample is fed first, with dts 1000.
        SrsMp4Sample early_audio;
        early_audio.dts = 1000;
        early_audio.type = SrsFrameTypeAudio;

        // The video sample follows, with dts 2000.
        SrsMp4Sample late_video;
        late_video.dts = 2000;
        late_video.type = SrsFrameTypeVideo;

        j.on_sample(&early_audio);
        j.on_sample(&late_video);

        // Both tracks have been seen.
        EXPECT_TRUE(j.is_initialized());

        // Video is behind audio by 2000 - 1000 = 1000.
        EXPECT_EQ(1000, j.get_first_sample_delta(SrsFrameTypeVideo));

        // Audio is the earlier track, so it is not shifted.
        EXPECT_EQ(0, j.get_first_sample_delta(SrsFrameTypeAudio));
    }

    // Video starts before audio: only the audio track gets shifted.
    if (true) {
        SrsMp4DvrJitter j;

        // The video sample is fed first, with dts 500.
        SrsMp4Sample early_video;
        early_video.dts = 500;
        early_video.type = SrsFrameTypeVideo;

        // The audio sample follows, with dts 1500.
        SrsMp4Sample late_audio;
        late_audio.dts = 1500;
        late_audio.type = SrsFrameTypeAudio;

        j.on_sample(&early_video);
        j.on_sample(&late_audio);

        // Both tracks have been seen.
        EXPECT_TRUE(j.is_initialized());

        // Audio is behind video by 1500 - 500 = 1000.
        EXPECT_EQ(1000, j.get_first_sample_delta(SrsFrameTypeAudio));

        // Video is the earlier track, so it is not shifted.
        EXPECT_EQ(0, j.get_first_sample_delta(SrsFrameTypeVideo));
    }

    // Both tracks start at the same dts: neither is shifted.
    if (true) {
        SrsMp4DvrJitter j;

        SrsMp4Sample a;
        a.dts = 1000;
        a.type = SrsFrameTypeAudio;

        SrsMp4Sample v;
        v.dts = 1000;
        v.type = SrsFrameTypeVideo;

        j.on_sample(&a);
        j.on_sample(&v);

        // Both tracks have been seen.
        EXPECT_TRUE(j.is_initialized());

        EXPECT_EQ(0, j.get_first_sample_delta(SrsFrameTypeVideo));
        EXPECT_EQ(0, j.get_first_sample_delta(SrsFrameTypeAudio));
    }

    // reset() discards whatever was recorded before.
    if (true) {
        SrsMp4DvrJitter j;

        // Record an audio start, then throw it away.
        SrsMp4Sample a;
        a.dts = 1000;
        a.type = SrsFrameTypeAudio;
        j.on_sample(&a);

        j.reset();

        EXPECT_FALSE(j.is_initialized());
        EXPECT_EQ(0, j.get_first_sample_delta(SrsFrameTypeVideo));
        EXPECT_EQ(0, j.get_first_sample_delta(SrsFrameTypeAudio));
    }

    // Only the first sample of a track counts; later ones of the same track are ignored.
    if (true) {
        SrsMp4DvrJitter j;

        SrsMp4Sample a_first;
        a_first.dts = 1000;
        a_first.type = SrsFrameTypeAudio;

        // Second audio sample, its dts must not replace the recorded start.
        SrsMp4Sample a_second;
        a_second.dts = 2000;
        a_second.type = SrsFrameTypeAudio;

        SrsMp4Sample v_first;
        v_first.dts = 1500;
        v_first.type = SrsFrameTypeVideo;

        j.on_sample(&a_first);
        j.on_sample(&a_second);
        j.on_sample(&v_first);

        // The audio start stays 1000, so video is behind by 1500 - 1000 = 500.
        EXPECT_EQ(500, j.get_first_sample_delta(SrsFrameTypeVideo));
        EXPECT_EQ(0, j.get_first_sample_delta(SrsFrameTypeAudio));
    }
}