Skip to content

Commit 112a0ae

Browse files
committed
Filter statistics to only include books in books_path
1 parent 515f12b commit 112a0ae

File tree

6 files changed

+90
-8
lines changed

6 files changed

+90
-8
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,7 @@ KoShelf can operate in several modes:
215215
- `--min-pages-per-day`: Minimum pages read per book per day to be counted in statistics (optional)
216216
- `--min-time-per-day`: Minimum reading time per book per day to be counted in statistics (e.g., "15m", "1h") (optional)
217217
> **Note:** If both `--min-pages-per-day` and `--min-time-per-day` are provided, a book's data for a day is counted if **either** condition is met for that book on that day. These filters apply **per book per day**, meaning each book must individually meet the threshold for each day to be included in statistics.
218+
- `--include-all-stats`: By default, statistics are filtered to only include books present in your `--books-path` directory. This prevents deleted books or external files (like Wallabag articles) from skewing your recap and statistics. Use this flag to include statistics for all books in the database, regardless of whether they exist in your library.
218219
- `--github`: Print GitHub repository URL
219220

220221
### Example

src/book_scanner.rs

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use std::collections::HashMap;
1+
use std::collections::{HashMap, HashSet};
22
use std::path::PathBuf;
33
use anyhow::{Result, bail};
44
use log::{debug, info, warn};
@@ -151,7 +151,7 @@ fn build_hashdocsettings_index(hashdocsettings_path: &PathBuf) -> Result<HashMap
151151
pub async fn scan_books(
152152
books_path: &PathBuf,
153153
metadata_location: &MetadataLocation,
154-
) -> Result<Vec<Book>> {
154+
) -> Result<(Vec<Book>, HashSet<String>)> {
155155
info!("Scanning books in directory: {:?}", books_path);
156156
let epub_parser = EpubParser::new();
157157
let lua_parser = LuaParser::new();
@@ -168,6 +168,7 @@ pub async fn scan_books(
168168
};
169169

170170
let mut books = Vec::new();
171+
let mut library_md5s = HashSet::new();
171172

172173
// Walk through all epub files
173174
for entry in walkdir::WalkDir::new(books_path) {
@@ -191,6 +192,10 @@ pub async fn scan_books(
191192
}
192193
};
193194

195+
// Track the MD5 for this book (for statistics filtering)
196+
// We may already have it from hashdocsettings lookup, or will get it from metadata
197+
let mut book_md5: Option<String> = None;
198+
194199
// Find metadata based on the configured location
195200
let metadata_path = match metadata_location {
196201
MetadataLocation::InBookFolder => {
@@ -219,6 +224,8 @@ pub async fn scan_books(
219224
match calculate_partial_md5(path) {
220225
Ok(hash) => {
221226
debug!("Calculated partial MD5 for {:?}: {}", path, hash);
227+
// Store the calculated MD5 for later use
228+
book_md5 = Some(hash.clone());
222229
hashdocsettings_index
223230
.as_ref()
224231
.and_then(|idx| idx.get(&hash.to_lowercase()).cloned())
@@ -246,6 +253,24 @@ pub async fn scan_books(
246253
None
247254
};
248255

256+
// Collect MD5 for statistics filtering:
257+
// 1. Prefer MD5 from metadata (stable even if file is updated)
258+
// 2. Use calculated MD5 from hashdocsettings lookup if available
259+
// 3. Fall back to calculating MD5 for books without metadata
260+
if let Some(ref metadata) = koreader_metadata {
261+
if let Some(ref md5) = metadata.partial_md5_checksum {
262+
library_md5s.insert(md5.clone());
263+
} else if let Some(ref md5) = book_md5 {
264+
library_md5s.insert(md5.clone());
265+
} else if let Ok(md5) = calculate_partial_md5(path) {
266+
library_md5s.insert(md5);
267+
}
268+
} else if let Some(ref md5) = book_md5 {
269+
library_md5s.insert(md5.clone());
270+
} else if let Ok(md5) = calculate_partial_md5(path) {
271+
library_md5s.insert(md5);
272+
}
273+
249274
let book = Book {
250275
id: generate_book_id(&epub_info.title),
251276
epub_info,
@@ -258,5 +283,5 @@ pub async fn scan_books(
258283
}
259284

260285
info!("Found {} books!", books.len());
261-
Ok(books)
286+
Ok((books, library_md5s))
262287
}

src/file_watcher.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ pub struct FileWatcher {
2121
time_config: TimeConfig,
2222
min_pages_per_day: Option<u32>,
2323
min_time_per_day: Option<u32>,
24+
include_all_stats: bool,
2425
}
2526

2627
impl FileWatcher {
@@ -35,6 +36,7 @@ impl FileWatcher {
3536
time_config: TimeConfig,
3637
min_pages_per_day: Option<u32>,
3738
min_time_per_day: Option<u32>,
39+
include_all_stats: bool,
3840
) -> Result<Self> {
3941
Ok(Self {
4042
books_path,
@@ -48,6 +50,7 @@ impl FileWatcher {
4850
time_config,
4951
min_pages_per_day,
5052
min_time_per_day,
53+
include_all_stats,
5154
})
5255
}
5356

@@ -114,6 +117,7 @@ impl FileWatcher {
114117
let time_config_clone = self.time_config.clone();
115118
let min_pages_per_day_clone = self.min_pages_per_day;
116119
let min_time_per_day_clone = self.min_time_per_day;
120+
let include_all_stats_clone = self.include_all_stats;
117121

118122
// Spawn delayed rebuild task
119123
let rebuild_task = tokio::task::spawn_blocking(move || {
@@ -142,6 +146,7 @@ impl FileWatcher {
142146
time_config_clone.clone(),
143147
min_pages_per_day_clone,
144148
min_time_per_day_clone,
149+
include_all_stats_clone,
145150
);
146151

147152
match site_generator.generate().await {

src/main.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,14 @@ struct Cli {
9090
#[arg(long, display_order = 14)]
9191
min_time_per_day: Option<String>,
9292

93+
/// Include statistics for all books in the database, not just those in --books-path.
94+
/// By default, when --books-path is provided, statistics are filtered to only include
95+
/// books present in that directory. Use this flag to include all statistics.
96+
#[arg(long, default_value = "false", display_order = 15)]
97+
include_all_stats: bool,
98+
9399
/// Print GitHub repository URL
94-
#[arg(long, display_order = 15)]
100+
#[arg(long, display_order = 16)]
95101
github: bool,
96102
}
97103

@@ -132,6 +138,7 @@ fn parse_time_to_seconds(time_str: &str) -> Result<Option<u32>> {
132138
async fn main() -> Result<()> {
133139
env_logger::builder()
134140
.filter_level(log::LevelFilter::Info)
141+
.parse_default_env()
135142
.init();
136143
let cli = Cli::parse();
137144

@@ -258,6 +265,7 @@ async fn main() -> Result<()> {
258265
time_config.clone(),
259266
cli.min_pages_per_day,
260267
min_time_per_day,
268+
cli.include_all_stats,
261269
);
262270

263271
// Generate initial site
@@ -283,6 +291,7 @@ async fn main() -> Result<()> {
283291
time_config.clone(),
284292
cli.min_pages_per_day,
285293
min_time_per_day,
294+
cli.include_all_stats,
286295
).await?;
287296

288297
// Run file watcher
@@ -302,6 +311,7 @@ async fn main() -> Result<()> {
302311
time_config.clone(),
303312
cli.min_pages_per_day,
304313
min_time_per_day,
314+
cli.include_all_stats,
305315
).await?;
306316

307317
// Start web server

src/site_generator.rs

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
use crate::models::*;
22
use crate::templates::*;
33
use crate::statistics_parser::StatisticsParser;
4-
use crate::statistics::{BookStatistics};
4+
use crate::statistics::{BookStatistics, StatisticsCalculator};
55
use crate::book_scanner::{scan_books, MetadataLocation};
66
use anyhow::{Result, Context};
77
use askama::Template;
@@ -28,6 +28,7 @@ pub struct SiteGenerator {
2828
time_config: TimeConfig,
2929
min_pages_per_day: Option<u32>,
3030
min_time_per_day: Option<u32>,
31+
include_all_stats: bool,
3132
}
3233

3334
impl SiteGenerator {
@@ -42,6 +43,7 @@ impl SiteGenerator {
4243
time_config: TimeConfig,
4344
min_pages_per_day: Option<u32>,
4445
min_time_per_day: Option<u32>,
46+
include_all_stats: bool,
4547
) -> Self {
4648
Self {
4749
output_dir,
@@ -54,6 +56,7 @@ impl SiteGenerator {
5456
time_config,
5557
min_pages_per_day,
5658
min_time_per_day,
59+
include_all_stats,
5760
}
5861
}
5962

@@ -74,10 +77,11 @@ impl SiteGenerator {
7477
info!("Generating static site in: {:?}", self.output_dir);
7578

7679
// Scan books if books_path is provided
77-
let books = if let Some(ref books_path) = self.books_path {
80+
// Also returns the set of MD5 hashes for all books (for statistics filtering)
81+
let (books, library_md5s) = if let Some(ref books_path) = self.books_path {
7882
scan_books(books_path, &self.metadata_location).await?
7983
} else {
80-
Vec::new()
84+
(Vec::new(), std::collections::HashSet::new())
8185
};
8286

8387
// After loading statistics if path is provided
@@ -95,6 +99,12 @@ impl SiteGenerator {
9599
);
96100
}
97101

102+
// Filter statistics to library books only (unless --include-all-stats is set)
103+
// This is skipped if no books_path is provided (can't filter without a library)
104+
if !self.include_all_stats && !books.is_empty() {
105+
StatisticsCalculator::filter_to_library(&mut data, &library_md5s);
106+
}
107+
98108
crate::statistics::StatisticsCalculator::populate_completions(&mut data, &self.time_config);
99109
Some(data)
100110
} else {

src/statistics.rs

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
use chrono::{NaiveDate, Duration, Datelike};
2-
use std::collections::HashMap;
2+
use std::collections::{HashMap, HashSet};
3+
use log::debug;
34

45
use crate::models::*;
56
use crate::read_completion_analyzer::{ReadCompletionDetector, CompletionConfig};
@@ -391,6 +392,36 @@ impl StatisticsCalculator {
391392
*valid_book_dates.get(&key).unwrap_or(&false)
392393
});
393394
}
395+
396+
/// Filter statistics to only include books present in the library.
397+
/// The library_md5s set contains MD5 hashes of books in the scanned library.
398+
/// This filters out statistics for deleted books or books in other directories.
399+
pub fn filter_to_library(stats_data: &mut StatisticsData, library_md5s: &HashSet<String>) {
400+
let original_count = stats_data.books.len();
401+
402+
// 1. Get IDs of books to keep (those whose MD5 is in library_md5s)
403+
let mut ids_to_keep: HashSet<i64> = HashSet::new();
404+
stats_data.books.retain(|book| {
405+
if library_md5s.contains(&book.md5) {
406+
ids_to_keep.insert(book.id);
407+
true
408+
} else {
409+
debug!("Filtering out statistics for book not in library: '{}' by {} (md5: {})",
410+
book.title, book.authors, book.md5);
411+
false
412+
}
413+
});
414+
415+
// 2. Filter page_stats to only include entries for kept books
416+
stats_data.page_stats.retain(|stat| ids_to_keep.contains(&stat.id_book));
417+
418+
// 3. Update stats_by_md5 map to only include kept books
419+
stats_data.stats_by_md5.retain(|md5, _| library_md5s.contains(md5));
420+
421+
let filtered_count = original_count - stats_data.books.len();
422+
log::info!("Filtered statistics to {} books present in library ({} excluded)",
423+
stats_data.books.len(), filtered_count);
424+
}
394425
}
395426

396427

0 commit comments

Comments (0)