Skip to content

Commit 8e57364

Browse files
suzp1984, duiniuluantanqin, xiaozhihong, winlinvip, ossrs-ai
committed
MP4 DVR: Fix audio/video synchronization issues in WebRTC recordings. v6.0.172 (#4230)
Fixes #3993 - WebRTC streams recorded to MP4 via DVR exhibit audio/video synchronization issues, with audio typically ahead of video.

**Note: This issue is specific to MP4 format; FLV recordings are not affected.**

When WebRTC streams are converted to RTMP and then muxed to MP4, the audio and video tracks may start at different timestamps. The MP4 muxer was not accounting for this timing offset between the first audio and video samples in the STTS (Decoding Time-to-Sample) table, causing the tracks to be misaligned in the final MP4 file.

Introduces `SrsMp4DvrJitter` class specifically for MP4 audio/video synchronization:

- **Timestamp Tracking**: Records the DTS of the first audio and video samples
- **Offset Calculation**: Computes the timing difference between track start times
- **MP4 STTS Correction**: Sets appropriate `sample_delta` values in the MP4 STTS table to maintain proper A/V sync

- Added `SrsMp4DvrJitter` class in `srs_kernel_mp4.hpp/cpp`
- Integrated jitter correction into `SrsMp4SampleManager::write_track()` for MP4 format only
- Added comprehensive unit tests covering various timing scenarios
- **Scope**: Changes are isolated to MP4 kernel code and do not affect FLV processing

This fix ensures that MP4 DVR recordings from WebRTC streams maintain proper audio/video synchronization regardless of the relative timing of the first audio and video frames, while leaving FLV format processing unchanged.

---------

Co-authored-by: Haibo Chen <[email protected]>
Co-authored-by: john <[email protected]>
Co-authored-by: winlin <[email protected]>
Co-authored-by: OSSRS-AI <[email protected]>
1 parent 02952f2 commit 8e57364

File tree

5 files changed

+225
-3
lines changed

5 files changed

+225
-3
lines changed

trunk/doc/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ The changelog for SRS.
77
<a name="v6-changes"></a>
88

99
## SRS 6.0 Changelog
10+
* v6.0, 2025-08-12, Merge [#4230](https://github.com/ossrs/srs/pull/4230): MP4 DVR: Fix audio/video synchronization issues in WebRTC recordings. v6.0.172 (#4230)
1011
* v6.0, 2025-08-11, Merge [#4432](https://github.com/ossrs/srs/pull/4432): AI: HTTP-FLV: Fix heap-use-after-free crash during stream unmount. v6.0.171 (#4432)
1112
* v6.0, 2025-07-28, Merge [#4245](https://github.com/ossrs/srs/pull/4245): Allow Forward to be configured with Env Var. v6.0.170 (#4245)
1213
* v6.0, 2025-07-10, Merge [#4414](https://github.com/ossrs/srs/pull/4414): Fix H.264 B-frame detection logic to comply with specification. v6.0.169 (#4414)

trunk/src/core/srs_core_version6.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@
77
#ifndef SRS_CORE_VERSION6_HPP
88
#define SRS_CORE_VERSION6_HPP
99

10-
#define VERSION_MAJOR 6
11-
#define VERSION_MINOR 0
12-
#define VERSION_REVISION 171
10+
#define VERSION_MAJOR 6
11+
#define VERSION_MINOR 0
12+
#define VERSION_REVISION 172
1313

1414
#endif

trunk/src/kernel/srs_kernel_mp4.cpp

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4818,12 +4818,60 @@ uint32_t SrsMp4Sample::pts_ms()
48184818
return (uint32_t)(pts * 1000 / tbn) + adjust;
48194819
}
48204820

4821+
// Start with no track observed: both start timestamps are 0 and both
// "first sample seen" flags are cleared, the same state reset() produces.
SrsMp4DvrJitter::SrsMp4DvrJitter()
    : video_start_dts_(0), audio_start_dts_(0), has_first_video_(false), has_first_audio_(false)
{
}
4825+
4826+
// Nothing to release: the jitter only holds plain timestamps and flags.
SrsMp4DvrJitter::~SrsMp4DvrJitter()
{
}
4829+
4830+
// Observe a sample and remember the DTS of the first audio and the first video
// sample. Later samples of a track that was already seen are ignored, as are
// samples of any other frame type.
// @param sample The sample being appended; it is dereferenced, so must not be NULL.
void SrsMp4DvrJitter::on_sample(SrsMp4Sample *sample)
{
    const bool is_audio = (sample->type == SrsFrameTypeAudio);
    const bool is_video = (sample->type == SrsFrameTypeVideo);

    if (is_audio && !has_first_audio_) {
        audio_start_dts_ = sample->dts;
        has_first_audio_ = true;
    }

    if (is_video && !has_first_video_) {
        video_start_dts_ = sample->dts;
        has_first_video_ = true;
    }
}
4842+
4843+
// Get the delta to assign to the first sample of the given track in the STTS table,
// so that the track which starts later is offset by the gap between the first DTS of
// the two tracks.
// @param track The track to query, SrsFrameTypeVideo or SrsFrameTypeAudio.
// @return The gap between the two start DTS, in the same unit as SrsMp4Sample::dts,
//      when this track starts after the other one. Returns 0 when this track starts
//      first, when both start together, when either track has not been observed yet,
//      or for any other frame type.
uint32_t SrsMp4DvrJitter::get_first_sample_delta(SrsFrameType track)
{
    // A start DTS of 0 may simply mean "never observed". Without both tracks there is
    // no offset to compensate, and comparing against that placeholder would turn the
    // absolute start DTS of a single-track recording into a bogus delta.
    if (!has_first_video_ || !has_first_audio_) {
        return 0;
    }

    uint64_t delta = 0;
    if (track == SrsFrameTypeVideo && video_start_dts_ > audio_start_dts_) {
        delta = video_start_dts_ - audio_start_dts_;
    } else if (track == SrsFrameTypeAudio && audio_start_dts_ > video_start_dts_) {
        delta = audio_start_dts_ - video_start_dts_;
    }

    // The timestamps are 64-bit while the returned delta is 32-bit, so narrow explicitly.
    return (uint32_t)delta;
}
4852+
4853+
void SrsMp4DvrJitter::reset()
4854+
{
4855+
video_start_dts_ = 0;
4856+
audio_start_dts_ = 0;
4857+
has_first_video_ = false;
4858+
has_first_audio_ = false;
4859+
}
4860+
4861+
bool SrsMp4DvrJitter::is_initialized()
4862+
{
4863+
return has_first_video_ && has_first_audio_;
4864+
}
4865+
48214866
// Create the manager together with the A/V sync jitter it owns; the jitter is
// released in the destructor.
SrsMp4SampleManager::SrsMp4SampleManager() : jitter_(new SrsMp4DvrJitter())
{
}
48244870

48254871
SrsMp4SampleManager::~SrsMp4SampleManager()
48264872
{
4873+
srs_freep(jitter_);
4874+
48274875
vector<SrsMp4Sample*>::iterator it;
48284876
for (it = samples.begin(); it != samples.end(); ++it) {
48294877
SrsMp4Sample* sample = *it;
@@ -4900,6 +4948,7 @@ SrsMp4Sample* SrsMp4SampleManager::at(uint32_t index)
49004948

49014949
// Append a sample to the manager.
// The jitter observes the sample before it is stored, so that it can record the
// first DTS of each track.
// @param sample The sample to append; it is dereferenced by the jitter, so must not be NULL.
void SrsMp4SampleManager::append(SrsMp4Sample* sample)
{
    // Let the jitter track the start time of the audio and video tracks.
    jitter_->on_sample(sample);

    // Keep the sample in the list of samples.
    samples.push_back(sample);
}
49054954

@@ -5077,6 +5126,7 @@ srs_error_t SrsMp4SampleManager::write_track(SrsFrameType track,
50775126
} else {
50785127
// The first sample always in the STTS table.
50795128
stts_entry.sample_count++;
5129+
stts_entry.sample_delta = jitter_->get_first_sample_delta(track);
50805130
}
50815131
}
50825132

trunk/src/kernel/srs_kernel_mp4.hpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1903,6 +1903,34 @@ class SrsMp4Sample
19031903
virtual uint32_t pts_ms();
19041904
};
19051905

1906+
// MP4 DVR jitter for audio/video synchronization in DVR recordings.
1907+
// Handles timing offset between audio and video tracks to ensure proper A/V sync in MP4 files.
1908+
class SrsMp4DvrJitter
1909+
{
1910+
private:
1911+
uint64_t video_start_dts_;
1912+
uint64_t audio_start_dts_;
1913+
bool has_first_video_;
1914+
bool has_first_audio_;
1915+
1916+
public:
1917+
SrsMp4DvrJitter();
1918+
virtual ~SrsMp4DvrJitter();
1919+
1920+
public:
1921+
// Record the first sample timestamp for each track type
1922+
virtual void on_sample(SrsMp4Sample *sample);
1923+
// Calculate the initial STTS delta for the first sample of a track
1924+
// to maintain A/V synchronization in MP4 files
1925+
virtual uint32_t get_first_sample_delta(SrsFrameType track);
1926+
1927+
private:
1928+
// Reset the jitter state (useful for new recording sessions)
1929+
virtual void reset();
1930+
// Check if both audio and video start times have been captured
1931+
virtual bool is_initialized();
1932+
};
1933+
19061934
// Build samples from moov, or write samples to moov.
19071935
// One or more sample are grouped to a chunk, each track contains one or more chunks.
19081936
// The offset of chunk is specified by stco.
@@ -1914,6 +1942,9 @@ class SrsMp4Sample
19141942
// The keyframe is specified by stss.
19151943
class SrsMp4SampleManager
19161944
{
1945+
private:
1946+
SrsMp4DvrJitter *jitter_; // MP4 A/V sync jitter handler
1947+
19171948
public:
19181949
std::vector<SrsMp4Sample*> samples;
19191950
public:

trunk/src/utest/srs_utest_mp4.cpp

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1869,3 +1869,143 @@ VOID TEST(KernelMp4Test, SrsMp4M2tsInitEncoder)
18691869
}
18701870
}
18711871

1872+
VOID TEST(KernelMp4Test, SrsMp4DvrJitter)
{
    // A fresh jitter has observed no track and reports no offset for either one.
    if (true) {
        SrsMp4DvrJitter jitter;

        EXPECT_FALSE(jitter.is_initialized());

        EXPECT_EQ(0, jitter.get_first_sample_delta(SrsFrameTypeVideo));
        EXPECT_EQ(0, jitter.get_first_sample_delta(SrsFrameTypeAudio));
    }

    // Audio starts before video: the video track carries the gap.
    if (true) {
        SrsMp4DvrJitter jitter;

        // The first audio sample, at DTS 1000.
        SrsMp4Sample first_audio;
        first_audio.dts = 1000;
        first_audio.type = SrsFrameTypeAudio;

        // The first video sample, later at DTS 2000.
        SrsMp4Sample first_video;
        first_video.dts = 2000;
        first_video.type = SrsFrameTypeVideo;

        jitter.on_sample(&first_audio);
        jitter.on_sample(&first_video);

        // Both tracks have been observed.
        EXPECT_TRUE(jitter.is_initialized());

        // Video is delayed by 2000 - 1000 = 1000.
        EXPECT_EQ(1000, jitter.get_first_sample_delta(SrsFrameTypeVideo));

        // Audio started first, so it needs no offset.
        EXPECT_EQ(0, jitter.get_first_sample_delta(SrsFrameTypeAudio));
    }

    // Video starts before audio: the audio track carries the gap.
    if (true) {
        SrsMp4DvrJitter jitter;

        // The first video sample, at DTS 500.
        SrsMp4Sample first_video;
        first_video.dts = 500;
        first_video.type = SrsFrameTypeVideo;

        // The first audio sample, later at DTS 1500.
        SrsMp4Sample first_audio;
        first_audio.dts = 1500;
        first_audio.type = SrsFrameTypeAudio;

        jitter.on_sample(&first_video);
        jitter.on_sample(&first_audio);

        // Both tracks have been observed.
        EXPECT_TRUE(jitter.is_initialized());

        // Audio is delayed by 1500 - 500 = 1000.
        EXPECT_EQ(1000, jitter.get_first_sample_delta(SrsFrameTypeAudio));

        // Video started first, so it needs no offset.
        EXPECT_EQ(0, jitter.get_first_sample_delta(SrsFrameTypeVideo));
    }

    // Both tracks start at the same DTS: neither needs an offset.
    if (true) {
        SrsMp4DvrJitter jitter;

        SrsMp4Sample first_audio;
        first_audio.dts = 1000;
        first_audio.type = SrsFrameTypeAudio;

        SrsMp4Sample first_video;
        first_video.dts = 1000;
        first_video.type = SrsFrameTypeVideo;

        jitter.on_sample(&first_audio);
        jitter.on_sample(&first_video);

        // Both tracks have been observed.
        EXPECT_TRUE(jitter.is_initialized());

        EXPECT_EQ(0, jitter.get_first_sample_delta(SrsFrameTypeVideo));
        EXPECT_EQ(0, jitter.get_first_sample_delta(SrsFrameTypeAudio));
    }

    // After reset, the jitter behaves like a fresh one.
    if (true) {
        SrsMp4DvrJitter jitter;

        // Record an audio start, then discard it.
        SrsMp4Sample first_audio;
        first_audio.dts = 1000;
        first_audio.type = SrsFrameTypeAudio;

        jitter.on_sample(&first_audio);

        jitter.reset();
        EXPECT_FALSE(jitter.is_initialized());
        EXPECT_EQ(0, jitter.get_first_sample_delta(SrsFrameTypeVideo));
        EXPECT_EQ(0, jitter.get_first_sample_delta(SrsFrameTypeAudio));
    }

    // Only the first sample of each track counts; later ones must be ignored.
    if (true) {
        SrsMp4DvrJitter jitter;

        SrsMp4Sample audio_a;
        audio_a.dts = 1000;
        audio_a.type = SrsFrameTypeAudio;

        // A second audio sample, whose DTS must not replace the first one.
        SrsMp4Sample audio_b;
        audio_b.dts = 2000;
        audio_b.type = SrsFrameTypeAudio;

        SrsMp4Sample video_a;
        video_a.dts = 1500;
        video_a.type = SrsFrameTypeVideo;

        jitter.on_sample(&audio_a);
        jitter.on_sample(&audio_b);
        jitter.on_sample(&video_a);

        // The gap is measured from the first audio sample: 1500 - 1000 = 500.
        EXPECT_EQ(500, jitter.get_first_sample_delta(SrsFrameTypeVideo));
        EXPECT_EQ(0, jitter.get_first_sample_delta(SrsFrameTypeAudio));
    }
}

0 commit comments

Comments
 (0)