# config.example.yaml
# EPUB LLM Cleaner Configuration
# ================================
# Copy this file to config.yaml and customize for your needs.
# All paths may be absolute, or relative to the directory containing this config file.
# ============================================================================
# LLM Provider Settings
# ============================================================================
# The LLM model to use for text analysis and cleaning.
# Examples: claude-sonnet-4-5-20250929, claude-opus-4-5-20251101, gpt-4o, gpt-4-turbo
model: claude-sonnet-4-5-20250929
# The LLM provider to use.
# Supported: anthropic, openai
# The API key environment variable for the selected provider must be set:
#   - anthropic: ANTHROPIC_API_KEY
#   - openai: OPENAI_API_KEY
provider: anthropic
# ============================================================================
# Input/Output Files
# ============================================================================
# Input: path to the EPUB file that will be processed.
input_file: 'input.epub'
# Output: path for the cleaned EPUB file.
output_file: 'output_cleaned.epub'
# ============================================================================
# HTML Parsing Settings
# ============================================================================
# CSS selectors that identify the paragraph elements to process, i.e. the
# main body-text paragraphs of the EPUB. You may need to inspect your EPUB's
# HTML to find the selectors that apply to your book.
# Common patterns:
#   '.x04-body-text'     standard body text paragraphs
#   '.x04-body-text-fl'  first-line styled body text (common in published ebooks)
#   'p.body'             generic body paragraphs
#   '.chapter-text'      chapter content paragraphs
paragraph_selectors:
  - '.x04-body-text'
  - '.x04-body-text-fl'
# ============================================================================
# API Rate Limiting
# ============================================================================
# Delay, in seconds, between API calls to avoid rate limiting.
#   - Increase this value if you encounter rate limit errors.
#   - Set to 0 to disable the delay (not recommended for large files).
rate_limit_delay: 0.1
# ============================================================================
# Token Limits
# ============================================================================
# Maximum tokens for the analysis phase (a quick check of whether a chapter
# needs cleaning). Keep this small: the analysis only returns "FILTER" or
# "PASS".
max_tokens_analysis: 10
# Maximum tokens for the cleaning phase (the actual text rewriting).
# This must be large enough to hold the cleaned paragraph output; increase it
# if you have very long chapters or many paragraphs to clean.
max_tokens_cleaning: 8000
# ============================================================================
# Advanced Settings (Optional)
# ============================================================================
# Path to custom prompts file (default: filter_prompts.txt in same directory)
# prompts_file: filter_prompts.txt
# Enable verbose logging for debugging
# verbose: false
# Dry run mode - analyze without making changes
# dry_run: false