Skip to content

Commit 4cf210e

Browse files
authored
fix: resample and downmix WAV files for Tencent Silk encoding (#9100)
* fix: resample and downmix WAV files for Tencent Silk encoding * fix: improve WAV to Tencent Silk conversion by handling sample width and resampling
1 parent 372b9f5 commit 4cf210e

2 files changed

Lines changed: 120 additions & 9 deletions

File tree

astrbot/core/utils/tencent_record_helper.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,17 @@
11
"""Tencent Silk audio conversion helpers."""
22

33
import asyncio
4+
import audioop
45
import os
56
import subprocess
67
import wave
78
from io import BytesIO
89

910
from astrbot.core import logger
1011

12+
# Sample rates (Hz) the SILK SDK accepts; WAV input at any other rate is resampled to 24 kHz before encoding.
13+
_PYSILK_SUPPORTED_RATES = frozenset({8000, 12000, 16000, 24000, 32000, 48000})
14+
1115

1216
async def tencent_silk_to_wav(silk_path: str, output_path: str) -> str:
1317
"""Decode a Tencent Silk file to 24 kHz mono PCM WAV.
@@ -69,16 +73,27 @@ async def wav_to_tencent_silk(wav_path: str, output_path: str) -> float:
6973

7074
with wave.open(wav_path, "rb") as wav:
7175
rate = wav.getframerate()
72-
frames = wav.getnframes()
73-
pcm_data = wav.readframes(frames)
76+
channels = wav.getnchannels()
77+
sampwidth = wav.getsampwidth()
78+
pcm_data = wav.readframes(wav.getnframes())
79+
80+
# Downmix to mono, resample to 24 kHz if needed, and convert to 16-bit PCM
81+
# (pysilk only accepts 16-bit linear PCM)
82+
if channels == 2:
83+
pcm_data = audioop.tomono(pcm_data, sampwidth, 0.5, 0.5)
84+
if rate not in _PYSILK_SUPPORTED_RATES:
85+
pcm_data, _ = audioop.ratecv(pcm_data, sampwidth, 1, rate, 24000, None)
86+
rate = 24000
87+
if sampwidth != 2:
88+
pcm_data = audioop.lin2lin(pcm_data, sampwidth, 2)
7489

7590
input_io = BytesIO(pcm_data)
7691
output_io = BytesIO()
7792
# tencent=True makes pysilk emit the QQ-compatible 0x02-prefixed SILK stream.
7893
pysilk.encode(input_io, output_io, rate, rate, tencent=True)
7994
with open(output_path, "wb") as f:
8095
f.write(output_io.getvalue())
81-
return frames / rate if rate else 0
96+
return len(pcm_data) / (2 * rate) if rate else 0
8297

8398

8499
async def convert_to_pcm_wav(input_path: str, output_path: str) -> str:

tests/test_media_utils.py

Lines changed: 102 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import math
33
import os
44
import struct
5+
import sys
56
import wave
67
from io import BytesIO
78
from pathlib import Path
@@ -653,19 +654,39 @@ def test_path_mapping_accepts_standard_and_legacy_file_uri(tmp_path):
653654

654655

655656
@pytest.mark.asyncio
656-
async def test_tencent_silk_encoding_uses_pysilk_tencent_format(tmp_path, monkeypatch):
657+
@pytest.mark.parametrize(
658+
"rate, channels",
659+
[
660+
(24000, 1), # supported, no resample
661+
(44100, 1), # unsupported rate, triggers resample
662+
(22050, 1), # unsupported rate, triggers resample
663+
(48000, 2), # stereo at supported rate, triggers downmix
664+
(44100, 2), # stereo + unsupported rate, triggers both
665+
],
666+
ids=["24k-mono", "44.1k-mono", "22.05k-mono", "48k-stereo", "44.1k-stereo"],
667+
)
668+
async def test_tencent_silk_encoding_uses_pysilk_tencent_format(
669+
rate, channels, tmp_path, monkeypatch
670+
):
671+
"""Real pysilk end-to-end across sample rates that previously failed.
672+
673+
44100 Hz was the regression trigger: pysilk rejects it with
674+
ENC_INPUT_INVALID_NO_OF_SAMPLES. The fix resamples to 24 kHz mono via
675+
audioop.ratecv before encoding.
676+
"""
657677
monkeypatch.setattr(media_utils, "get_astrbot_temp_path", lambda: str(tmp_path))
658678
wav_path = tmp_path / "tone.wav"
659679
silk_path = tmp_path / "tone.silk"
660-
rate = 24000
661-
frames = int(rate * 0.2)
680+
secs = 0.2
681+
frames = int(rate * secs)
662682
with wave.open(str(wav_path), "wb") as wav:
663-
wav.setnchannels(1)
683+
wav.setnchannels(channels)
664684
wav.setsampwidth(2)
665685
wav.setframerate(rate)
666686
for i in range(frames):
667687
sample = int(0.2 * 32767 * math.sin(2 * math.pi * 440 * i / rate))
668-
wav.writeframesraw(struct.pack("<h", sample))
688+
for _ in range(channels):
689+
wav.writeframesraw(struct.pack("<h", sample))
669690

670691
duration = await wav_to_tencent_silk(str(wav_path), str(silk_path))
671692
silk_bytes = silk_path.read_bytes()
@@ -679,7 +700,82 @@ async def test_tencent_silk_encoding_uses_pysilk_tencent_format(tmp_path, monkey
679700
assert resolved.format == "tencent_silk"
680701
assert resolved.mime_type == "audio/silk"
681702

682-
assert duration == pytest.approx(0.2)
703+
assert duration == pytest.approx(secs, abs=0.05)
683704
assert silk_bytes.startswith(b"\x02#!SILK_V3")
684705
assert resolved_silk_bytes.startswith(b"\x02#!SILK_V3")
685706
assert not resolved_silk_path.exists()
707+
708+
709+
def _make_wav(path, rate, channels=1, secs=0.2, freq=440):
710+
"""Write a short sine-tone WAV at the given rate/channels."""
711+
nframes = int(rate * secs)
712+
with wave.open(str(path), "wb") as wav:
713+
wav.setnchannels(channels)
714+
wav.setsampwidth(2)
715+
wav.setframerate(rate)
716+
for i in range(nframes):
717+
sample = int(0.2 * 32767 * math.sin(2 * math.pi * freq * i / rate))
718+
for _ in range(channels):
719+
wav.writeframesraw(struct.pack("<h", sample))
720+
721+
722+
class _FakePysilk:
723+
"""Stand-in for the ``pysilk`` module that records encode() calls."""
724+
725+
def __init__(self):
726+
self.calls = []
727+
728+
def encode(self, input_io, output_io, sample_rate, bit_rate, tencent=True):
729+
self.calls.append({"sample_rate": sample_rate, "tencent": tencent})
730+
output_io.write(b"\x02#!SILK_V3")
731+
732+
733+
@pytest.mark.asyncio
async def test_wav_to_tencent_silk_resamples_unsupported_rate(tmp_path, monkeypatch):
    """Check that a 44100 Hz WAV reaches ``pysilk.encode`` at 24 kHz."""
    # Register the recording stub under the name "pysilk" in sys.modules.
    stub = _FakePysilk()
    monkeypatch.setitem(sys.modules, "pysilk", stub)

    source = tmp_path / "tts_44100.wav"
    target = tmp_path / "out.silk"
    _make_wav(source, 44100)

    await wav_to_tencent_silk(str(source), str(target))

    assert len(stub.calls) == 1
    recorded = stub.calls[0]
    assert recorded["sample_rate"] == 24000
    assert recorded["tencent"] is True
    # The stub's header bytes must have been written to the output file.
    assert target.read_bytes().startswith(b"\x02#!SILK_V3")
749+
750+
751+
@pytest.mark.asyncio
async def test_wav_to_tencent_silk_resamples_stereo(tmp_path, monkeypatch):
    """Stereo input at a supported rate must still be downmixed to mono."""

    class _CapturingPysilk(_FakePysilk):
        """Fake that also keeps the PCM payload handed to encode()."""

        pcm = None

        def encode(self, input_io, output_io, sample_rate, bit_rate, tencent=True):
            self.pcm = input_io.getvalue()
            super().encode(input_io, output_io, sample_rate, bit_rate, tencent=tencent)

    fake = _CapturingPysilk()
    monkeypatch.setitem(sys.modules, "pysilk", fake)

    wav_path = tmp_path / "stereo_48k.wav"
    _make_wav(wav_path, 48000, channels=2)
    with wave.open(str(wav_path), "rb") as wav:
        nframes = wav.getnframes()

    await wav_to_tencent_silk(str(wav_path), str(tmp_path / "out.silk"))

    assert len(fake.calls) == 1
    # 48000 Hz is supported, so only downmix happens -- rate stays unchanged.
    assert fake.calls[0]["sample_rate"] == 48000
    # Mono 16-bit PCM is 2 bytes per frame; a stereo payload would be twice
    # that, so this fails if the downmix is skipped.
    assert fake.pcm is not None
    assert len(fake.pcm) == nframes * 2
765+
766+
767+
@pytest.mark.asyncio
async def test_wav_to_tencent_silk_skips_resample_for_supported_rate(
    tmp_path, monkeypatch
):
    """Check that a 24000 Hz mono WAV is encoded at its original rate."""
    # Register the recording stub under the name "pysilk" in sys.modules.
    stub = _FakePysilk()
    monkeypatch.setitem(sys.modules, "pysilk", stub)

    source = tmp_path / "tone_24k.wav"
    target = tmp_path / "out.silk"
    _make_wav(source, 24000)

    await wav_to_tencent_silk(str(source), str(target))

    assert len(stub.calls) == 1
    recorded = stub.calls[0]
    assert recorded["sample_rate"] == 24000

0 commit comments

Comments
 (0)