Skip to content

Commit c424bce

Browse files
author
chaoyuepan
committed
#19 support audio
1 parent 5298235 commit c424bce

13 files changed

Lines changed: 1116 additions & 97 deletions

File tree

.env.example

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,30 @@ CHUNK_OVERLAP=200
4949
# Set to false to use original simple text extraction (may not work well for binary formats)
5050
ENABLE_MARKITDOWN=true
5151

52+
# Audio Transcription Configuration
53+
# ============================
54+
# Set to true to enable vosk-transcriber for converting audio files to text (the application defaults to false when this variable is unset)
55+
# Requires vosk-transcriber CLI tool to be installed
56+
# Installation: https://github.com/alphacep/vosk-transcriber
57+
# Supported formats: mp3, wav, m4a, aac, flac, ogg, wma, opus, mp4, avi, mkv, mov, webm
58+
ENABLE_VOSK_TRANSCRIBER=true
59+
60+
# Path to vosk model directory (download from https://alphacephei.com/vosk/models)
61+
#
62+
# English models:
63+
# - Small: vosk-model-small-en-us-0.15 (fast, good accuracy)
64+
# - Full: vosk-model-en-us-0.22 (slower, better accuracy)
65+
# Path example: /root/.cache/vosk/vosk-model-small-en-us-0.15
66+
#
67+
# Chinese models:
68+
# - Small: vosk-model-small-cn-0.22 (fast, good for Mandarin)
69+
# Path example: /root/.cache/vosk/vosk-model-small-cn-0.22
70+
#
71+
# Download and extract:
72+
# wget https://alphacephei.com/vosk/models/vosk-model-small-cn-0.22.zip
73+
# unzip vosk-model-small-cn-0.22.zip -d /root/.cache/vosk/
74+
VOSK_MODEL_PATH=/root/.cache/vosk/vosk-model-small-cn-0.22
75+
5276
# 允许删除(默认为 true)
5377
ALLOW_DELETE=true
5478

README.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ An AI-powered knowledge management application that lets you create intelligent
2121

2222
## ✨ Features
2323

24-
- 📚 **Multiple Source Types** - Upload PDFs, text files, Markdown, DOCX, HTML documents, and video URLs (YouTube, Bilibili with automatic subtitle extraction)
24+
- 📚 **Multiple Source Types** - Upload PDFs, text files, Markdown, DOCX, HTML documents, audio files (MP3, WAV, M4A, etc.), and video URLs (YouTube, Bilibili with automatic subtitle extraction)
2525
- 🤖 **AI-Powered Chat** - Ask questions and get answers based on your sources
2626
- **Multiple Transformations** - Generate summaries, FAQs, study guides, outlines, timelines, glossaries, quizzes, mindmaps, infographics and podcast scripts
2727
- 📊 **Infographic Generation** - Create beautiful, hand-drawn style infographics from your content using Google's Gemini Nano Banana
@@ -38,6 +38,7 @@ An AI-powered knowledge management application that lets you create intelligent
3838
- An LLM API key (OpenAI) or Ollama running locally
3939
- [markitdown](https://github.com/microsoft/markitdown) (optional, for better document conversion)
4040
- [yt-dlp](https://github.com/yt-dlp/yt-dlp) (optional, for extracting subtitles from YouTube and Bilibili videos)
41+
- [vosk-transcriber](https://github.com/alphacep/vosk-transcriber) (optional, for transcribing audio files to text)
4142

4243
### Installation
4344

@@ -171,6 +172,7 @@ You can add content to your notebook in three ways:
171172
- Click the "+" button in the Sources panel
172173
- Drag and drop or browse for files
173174
- Supported: PDF, TXT, MD, DOCX, HTML
175+
- Audio files: MP3, WAV, M4A, AAC, FLAC, OGG, WMA, OPUS (auto-transcribed to text)
174176

175177
**Paste Text**
176178

@@ -230,6 +232,10 @@ CHUNK_OVERLAP=200 # Overlap between chunks
230232
# Document Conversion
231233
ENABLE_MARKITDOWN=true # Use Microsoft markitdown for better PDF/DOCX conversion
232234
235+
# Audio Transcription
236+
ENABLE_VOSK_TRANSCRIBER=false # Set to true to enable audio file transcription (requires vosk-transcriber)
237+
VOSK_MODEL_PATH=/usr/local/share/vosk-model-en # Path to the vosk model directory
238+
233239
# Podcast Generation
234240
ENABLE_PODCAST=true
235241
PODCAST_VOICE=alloy # Options: alloy, echo, fable, onyx, nova, shimmer

README_CN.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,13 @@ CHUNK_OVERLAP=200 # 分块重叠
230230
# 文档转换
231231
ENABLE_MARKITDOWN=true # 使用 Microsoft markitdown 更好地转换 PDF/DOCX
232232
233+
# 音频转录
234+
ENABLE_VOSK_TRANSCRIBER=false # 启用音频文件转录
235+
# 中文模型路径示例
236+
VOSK_MODEL_PATH=/root/.cache/vosk/vosk-model-small-cn-0.22
237+
# 英文模型路径示例
238+
# VOSK_MODEL_PATH=/root/.cache/vosk/vosk-model-small-en-us-0.15
239+
233240
# 播客生成
234241
ENABLE_PODCAST=true
235242
PODCAST_VOICE=alloy # 选项:alloy、echo、fable、onyx、nova、shimmer

backend/config.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,10 @@ type Config struct {
5858
// Document conversion
5959
EnableMarkitdown bool
6060

61+
// Audio transcription
62+
EnableVoskTranscriber bool
63+
VoskModelPath string // Path to vosk model directory
64+
6165
// Demo settings
6266
AllowMultipleNotesOfSameType bool
6367

@@ -133,6 +137,8 @@ func LoadConfig() Config {
133137
EnablePodcast: getEnvBool("ENABLE_PODCAST", true),
134138
PodcastVoice: getEnv("PODCAST_VOICE", "alloy"),
135139
EnableMarkitdown: getEnvBool("ENABLE_MARKITDOWN", true),
140+
EnableVoskTranscriber: getEnvBool("ENABLE_VOSK_TRANSCRIBER", false),
141+
VoskModelPath: getEnv("VOSK_MODEL_PATH", "/usr/local/share/vosk-model-en"),
136142
AllowMultipleNotesOfSameType: getEnvBool("ALLOW_MULTIPLE_NOTES_OF_SAME_TYPE", true),
137143
LangChainAPIKey: getEnv("LANGCHAIN_API_KEY", ""),
138144
LangChainProject: getEnv("LANGCHAIN_PROJECT", "notex"),

backend/frontend/index.html

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -203,15 +203,15 @@ <h2 class="panel-title">来源</h2>
203203
<div class="panel-header">
204204
<div class="panel-tabs" id="centerPanelTabs">
205205
<button class="tab-btn active" data-tab="chat">对话</button>
206-
<button class="tab-btn" data-tab="note" id="tabBtnNote">
207-
笔记
208-
<span class="tab-close" id="btnCloseNote">×</span>
206+
<button class="tab-btn" data-tab="sessions" id="tabBtnSessions">
207+
会话历史
209208
</button>
210-
<button class="tab-btn hidden" data-tab="notes_list" id="tabBtnNotesList">
209+
<button class="tab-btn" data-tab="notes_list" id="tabBtnNotesList">
211210
笔记列表
212211
</button>
213-
<button class="tab-btn" data-tab="sessions" id="tabBtnSessions">
214-
会话历史
212+
<button class="tab-btn" data-tab="note" id="tabBtnNote">
213+
笔记
214+
<span class="tab-close" id="btnCloseNote">×</span>
215215
</button>
216216
</div>
217217
<div class="panel-header-actions">

0 commit comments

Comments
 (0)