Skip to content

Commit 5298235

Browse files
author
chaoyuepan
committed
#18 support youtube/bilibili videos
1 parent 2e3f92c commit 5298235

3 files changed

Lines changed: 150 additions & 4 deletions

File tree

README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ An AI-powered knowledge management application that lets you create intelligent
2121

2222
## ✨ Features
2323

24-
- 📚 **Multiple Source Types** - Upload PDFs, text files, Markdown, DOCX, and HTML documents
24+
- 📚 **Multiple Source Types** - Upload PDFs, text files, Markdown, DOCX, and HTML documents, or add video URLs (YouTube and Bilibili, with automatic subtitle extraction)
2525
- 🤖 **AI-Powered Chat** - Ask questions and get answers based on your sources
2626
- **Multiple Transformations** - Generate summaries, FAQs, study guides, outlines, timelines, glossaries, quizzes, mindmaps, infographics and podcast scripts
2727
- 📊 **Infographic Generation** - Create beautiful, hand-drawn style infographics from your content using Google's Gemini Nano Banana
@@ -36,6 +36,8 @@ An AI-powered knowledge management application that lets you create intelligent
3636

3737
- Go 1.23 or later
3838
- An LLM API key (OpenAI) or Ollama running locally
39+
- [markitdown](https://github.com/microsoft/markitdown) (optional, for better document conversion)
40+
- [yt-dlp](https://github.com/yt-dlp/yt-dlp) (optional, for extracting subtitles from YouTube and Bilibili videos)
3941

4042
### Installation
4143

@@ -179,6 +181,8 @@ You can add content to your notebook in three ways:
179181

180182
- Select the "URL" tab
181183
- Enter the URL and optional title
184+
- Supports web pages and video URLs (YouTube, Bilibili)
185+
- For videos, subtitles are automatically extracted and used as content
182186

183187
### Chatting with Sources
184188

README_CN.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ AI 驱动知识管理应用程序,让您从文档中创建智能笔记本。
2121

2222
## ✨ 特性
2323

24-
- 📚 **多种来源类型** - 支持上传 PDF、文本文件、Markdown、DOCX 和 HTML 文档
24+
- 📚 **多种来源类型** - 支持上传 PDF、文本文件、Markdown、DOCX、HTML 文档和视频 URL(YouTube、Bilibili 自动提取字幕)
2525
- 🤖 **AI 驱动对话** - 基于您的来源提问并获得答案
2626
- **多种转换** - 生成摘要、FAQ、学习指南、大纲、时间线、词汇表、测验、思维导图、信息图和播客脚本
2727
- 📊 **信息图生成** - 使用 Google Gemini Nano Banana 从您的内容创建精美的手绘风格信息图
@@ -36,6 +36,8 @@ AI 驱动知识管理应用程序,让您从文档中创建智能笔记本。
3636

3737
- Go 1.23 或更高版本
3838
- LLM API 密钥 (OpenAI) 或本地运行的 Ollama
39+
- [markitdown](https://github.com/microsoft/markitdown)(可选,用于更好的文档转换)
40+
- [yt-dlp](https://github.com/yt-dlp/yt-dlp)(可选,用于从 YouTube 和 Bilibili 视频提取字幕)
3941

4042
### 安装
4143

@@ -179,6 +181,8 @@ go build -o notex .
179181

180182
- 选择 "URL" 标签
181183
- 输入 URL 和可选标题
184+
- 支持网页和视频 URL(YouTube、Bilibili)
185+
- 对于视频,会自动提取字幕作为内容
182186

183187
### 与来源对话
184188

backend/vector.go

Lines changed: 140 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,141 @@ func (vs *VectorStore) needsMarkitdown(ext string) bool {
350350
return markitdownExts[ext]
351351
}
352352

353+
354+
// isVideoURL reports whether url points at a supported video platform
// (YouTube or Bilibili, including their short-link domains).
//
// Only the host part of the URL is inspected, so a page on another site that
// merely mentions a video domain in its path or query string (for example
// "https://example.com/?ref=youtube.com") is not treated as a video. A host
// matches when it equals one of the known domains or is a subdomain of it
// ("www.youtube.com", "m.bilibili.com"). The comparison is case-insensitive
// and the scheme is optional.
func isVideoURL(url string) bool {
	host := strings.ToLower(strings.TrimSpace(url))

	// Strip the scheme. "://" only counts as a scheme separator when it comes
	// before any path/query/fragment, so "youtube.com/r?u=http://x" still works.
	if i := strings.Index(host, "://"); i >= 0 && !strings.ContainsAny(host[:i], "/?#") {
		host = host[i+3:]
	}
	// Keep only the authority component.
	if i := strings.IndexAny(host, "/?#"); i >= 0 {
		host = host[:i]
	}
	// Drop optional userinfo ("user:pass@host").
	if i := strings.LastIndex(host, "@"); i >= 0 {
		host = host[i+1:]
	}
	// Drop optional port.
	if i := strings.LastIndex(host, ":"); i >= 0 {
		host = host[:i]
	}
	// A fully-qualified host may carry a trailing dot.
	host = strings.TrimSuffix(host, ".")

	videoDomains := []string{
		"youtube.com",
		"youtu.be",
		"bilibili.com",
		"b23.tv",
	}
	for _, domain := range videoDomains {
		if host == domain || strings.HasSuffix(host, "."+domain) {
			return true
		}
	}
	return false
}
370+
371+
// extractVideoSubtitle extracts subtitle from video URL using yt-dlp
372+
func (vs *VectorStore) extractVideoSubtitle(url string) (string, error) {
373+
fmt.Printf("[VectorStore] Extracting subtitle from video: %s\n", url)
374+
375+
// Create temporary file for subtitle
376+
tmpSubFile := filepath.Join(os.TempDir(), fmt.Sprintf("subtitle_%d.srt", os.Getpid()))
377+
378+
// Use yt-dlp to download subtitles
379+
// --write-subs: write subtitles
380+
// --sub-langs all: download all available languages
381+
// --skip-download: don't download the video itself
382+
// --sub-format srt: use SRT format
383+
// -o: output file
384+
cmd := exec.Command("yt-dlp",
385+
"--write-subs",
386+
"--sub-langs", "all",
387+
"--skip-download",
388+
"--sub-format", "srt",
389+
"-o", tmpSubFile,
390+
url,
391+
)
392+
393+
output, err := cmd.CombinedOutput()
394+
if err != nil {
395+
fmt.Printf("[VectorStore] yt-dlp error: %s\n", string(output))
396+
return "", fmt.Errorf("failed to extract subtitle: %w, output: %s", err, string(output))
397+
}
398+
399+
// Find the actual subtitle file (yt-dlp may add language suffix)
400+
matches, _ := filepath.Glob(tmpSubFile + "*.srt")
401+
if len(matches) == 0 {
402+
return "", fmt.Errorf("no subtitle file found")
403+
}
404+
actualSubFile := matches[0]
405+
406+
// Read the subtitle file
407+
subContent, err := os.ReadFile(actualSubFile)
408+
if err != nil {
409+
os.Remove(actualSubFile)
410+
return "", fmt.Errorf("failed to read subtitle file: %w", err)
411+
}
412+
413+
// Clean up subtitle file
414+
os.Remove(actualSubFile)
415+
416+
// Convert SRT to plain text format
417+
textContent := vs.convertSRTToText(string(subContent))
418+
419+
fmt.Printf("[VectorStore] Subtitle extracted successfully, size: %d bytes\n", len(textContent))
420+
return textContent, nil
421+
}
422+
423+
// convertSRTToText converts SRT subtitle format to readable text
424+
func (vs *VectorStore) convertSRTToText(srtContent string) string {
425+
lines := strings.Split(srtContent, "\n")
426+
var result []string
427+
currentText := []string{}
428+
429+
for _, line := range lines {
430+
line = strings.TrimSpace(line)
431+
432+
// Skip empty lines
433+
if line == "" {
434+
if len(currentText) > 0 {
435+
result = append(result, strings.Join(currentText, " "))
436+
currentText = []string{}
437+
}
438+
continue
439+
}
440+
441+
// Skip sequence numbers (1, 2, 3, ...)
442+
if _, err := fmt.Sscanf(line, "%d", new(int)); err == nil && len(line) < 10 {
443+
if len(currentText) > 0 {
444+
result = append(result, strings.Join(currentText, " "))
445+
currentText = []string{}
446+
}
447+
continue
448+
}
449+
450+
// Skip timestamp lines (00:00:00,000 --> 00:00:05,000)
451+
if strings.Contains(line, "-->") {
452+
if len(currentText) > 0 {
453+
result = append(result, strings.Join(currentText, " "))
454+
currentText = []string{}
455+
}
456+
continue
457+
}
458+
459+
// Skip hex/hash lines (sometimes in SRT files)
460+
if strings.HasPrefix(line, "#") {
461+
continue
462+
}
463+
464+
// Remove HTML tags and common subtitle artifacts
465+
line = strings.ReplaceAll(line, "<i>", "")
466+
line = strings.ReplaceAll(line, "</i>", "")
467+
line = strings.ReplaceAll(line, "<b>", "")
468+
line = strings.ReplaceAll(line, "</b>", "")
469+
line = strings.ReplaceAll(line, "&lt;", "<")
470+
line = strings.ReplaceAll(line, "&gt;", ">")
471+
line = strings.ReplaceAll(line, "&amp;", "&")
472+
line = strings.ReplaceAll(line, "&#39;", "'")
473+
line = strings.ReplaceAll(line, "&quot;", "\"")
474+
475+
// Add to current text if it's not empty
476+
if line != "" {
477+
currentText = append(currentText, line)
478+
}
479+
}
480+
481+
// Don't forget the last segment
482+
if len(currentText) > 0 {
483+
result = append(result, strings.Join(currentText, " "))
484+
}
485+
486+
return strings.Join(result, "\n")
487+
}
353488
// ExtractFromURL fetches and converts content from a URL using markitdown
354489
func (vs *VectorStore) ExtractFromURL(ctx context.Context, url string) (string, error) {
355490
fmt.Printf("[VectorStore] Fetching content from URL: %s\n", url)
@@ -358,6 +493,11 @@ func (vs *VectorStore) ExtractFromURL(ctx context.Context, url string) (string,
358493
return "", fmt.Errorf("markitdown is disabled, cannot fetch URL content")
359494
}
360495

496+
// Check if it's a video URL and extract subtitles
497+
if isVideoURL(url) {
498+
return vs.extractVideoSubtitle(url)
499+
}
500+
361501
// Step 1: Use curl to download the webpage to a temporary HTML file
362502
tmpHTMLFile := filepath.Join(os.TempDir(), fmt.Sprintf("webpage_%d.html", os.Getpid()))
363503
tmpMDFile := filepath.Join(os.TempDir(), fmt.Sprintf("markitdown_url_%d.md", os.Getpid()))
@@ -398,8 +538,6 @@ func (vs *VectorStore) ExtractFromURL(ctx context.Context, url string) (string,
398538
fmt.Printf("[VectorStore] URL content fetched and converted successfully, output size: %d bytes\n", len(content))
399539
return string(content), nil
400540
}
401-
402-
// convertWithMarkitdown converts a document to Markdown using the markitdown CLI tool
403541
func (vs *VectorStore) convertWithMarkitdown(filePath string) (string, error) {
404542
fmt.Printf("[VectorStore] Converting with markitdown: %s\n", filePath)
405543

0 commit comments

Comments
 (0)