Skip to content

Commit a4df209

Browse files
authored
fix: improved search stability for annas archive (#45)
1 parent d9b2729 commit a4df209

3 files changed

Lines changed: 388 additions & 33 deletions

File tree

sake/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
"packageManager": "bun@1.3.8",
77
"scripts": {
88
"dev": "vite dev",
9+
"dev:selfhosted": "bun --no-env-file --env-file=.env --env-file=.env.docker.selfhosted run dev",
910
"dev:mig": "bun run db:migrate && bun run dev",
1011
"build": "node ./scripts/run-build-with-project-env.mjs",
1112
"preview": "vite preview",

sake/src/lib/server/infrastructure/search-providers/AnnaArchiveSearchProvider.ts

Lines changed: 205 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ const ANNA_ARCHIVE_BASE_URL = 'https://annas-archive.gl';
2020
const ANNA_ARCHIVE_BROWSER_USER_AGENT =
2121
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
2222
const ANNA_LIBGEN_ADS_BASE_URL = 'https://libgen.li/ads.php';
23+
const ANNA_MAX_FILTERED_SEARCH_PAGES = 5;
2324

2425
const annaLibgenGetLinkRegex = /href="(get\.php\?md5=[^"]+)"/i;
2526

@@ -45,6 +46,31 @@ interface AnnaMetaInformation {
4546
sourceFamily: string | null;
4647
}
4748

49+
/**
 * Maps a normalised (trimmed, lower-cased) language filter token to the
 * two-letter code that is sent as the `lang` parameter of the search URL.
 *
 * Every spelling listed in ANNA_LANGUAGE_ALIASES has an entry here, so a
 * filter such as `deu` or `eng` narrows the remote search in the same way as
 * `german` or `en` instead of silently skipping the `lang` parameter.
 */
const ANNA_LANGUAGE_FILTER_CODES: Record<string, string> = {
	english: 'en',
	en: 'en',
	eng: 'en',
	german: 'de',
	de: 'de',
	deu: 'de',
	ger: 'de',
	french: 'fr',
	fr: 'fr',
	fra: 'fr',
	fre: 'fr',
	spanish: 'es',
	es: 'es',
	spa: 'es'
};
59+
60+
// Canonical language name -> every spelling treated as that language.
// annaQueryVariants and matchesLanguageFilter scan these entries to relate a
// lower-cased token to its canonical name and its sibling spellings.
const ANNA_LANGUAGE_ALIASES: Record<string, string[]> = {
61+
english: ['english', 'en', 'eng'],
62+
german: ['german', 'de', 'deu', 'ger'],
63+
french: ['french', 'fr', 'fra', 'fre'],
64+
spanish: ['spanish', 'es', 'spa']
65+
};
66+
67+
// Canonical language name -> extra words that annaQueryVariants appends to the
// base query (one variant per word) when exactly one language is requested and
// the query does not already contain that word.
const ANNA_LANGUAGE_QUERY_HINTS: Record<string, string[]> = {
68+
english: ['english'],
69+
german: ['deutsch'],
70+
french: ['francais', 'french'],
71+
spanish: ['espanol', 'spanish']
72+
};
73+
4874
function isValidCodePoint(codePoint: number): boolean {
4975
return (
5076
Number.isFinite(codePoint) &&
@@ -123,6 +149,10 @@ function normalizeLanguageToken(value: string): string {
123149
return value.trim().toLowerCase();
124150
}
125151

152+
/**
 * Canonicalises a file-extension filter value: surrounding whitespace is
 * removed and the remainder is lower-cased.
 *
 * @param value raw extension string
 * @returns the trimmed, lower-cased token (possibly empty)
 */
function normalizeExtensionToken(value: string): string {
	const withoutPadding = value.trim();
	return withoutPadding.toLowerCase();
}
155+
126156
function languageFilterTokens(input: SearchBooksRequest): Set<string> {
127157
return new Set(
128158
(input.filters?.language ?? [])
@@ -131,6 +161,90 @@ function languageFilterTokens(input: SearchBooksRequest): Set<string> {
131161
);
132162
}
133163

164+
/**
 * Resolves the request's language filter to the code used for the `lang`
 * parameter of the search URL.
 *
 * @param input search request whose language tokens come from languageFilterTokens
 * @returns the code from ANNA_LANGUAGE_FILTER_CODES, or `null` when the number
 *          of distinct requested languages is not exactly one or the token has
 *          no entry in the table
 */
function annaLanguageFilterCode(input: SearchBooksRequest): string | null {
	const requestedLanguages = [...languageFilterTokens(input)];
	if (requestedLanguages.length !== 1) {
		return null;
	}

	const [requestedLanguage] = requestedLanguages;

	// Only own keys count. A bare index lookup also resolves inherited members,
	// so a token such as "constructor" would return a function and end up
	// stringified into the URL instead of being rejected.
	if (!Object.prototype.hasOwnProperty.call(ANNA_LANGUAGE_FILTER_CODES, requestedLanguage)) {
		return null;
	}

	return ANNA_LANGUAGE_FILTER_CODES[requestedLanguage] ?? null;
}
172+
173+
/**
 * Picks the extension to send as a search parameter.
 *
 * @param input search request; `filters.extension` may be absent
 * @returns the single distinct, non-empty normalised extension, or `null`
 *          when there are none or more than one
 */
function annaExtensionFilter(input: SearchBooksRequest): string | null {
	const distinctExtensions = new Set<string>();

	for (const rawExtension of input.filters?.extension ?? []) {
		const token = normalizeExtensionToken(rawExtension);
		if (token.length > 0) {
			distinctExtensions.add(token);
		}
	}

	if (distinctExtensions.size !== 1) {
		return null;
	}

	const [onlyExtension] = [...distinctExtensions];
	return onlyExtension;
}
182+
183+
/**
 * Produces the list of query strings to try, in order.
 *
 * The trimmed query always comes first. When exactly one language is
 * requested, one extra variant per matching entry of ANNA_LANGUAGE_QUERY_HINTS
 * is appended ("<query> <hint>"), skipping hints the query already contains
 * (case-insensitive substring check).
 *
 * @param input search request
 * @returns an empty array for a blank query, otherwise the base query followed
 *          by any hinted variants
 */
function annaQueryVariants(input: SearchBooksRequest): string[] {
	const trimmedQuery = input.query.trim();
	if (trimmedQuery.length === 0) {
		return [];
	}

	const languages = [...languageFilterTokens(input)];
	if (languages.length !== 1) {
		return [trimmedQuery];
	}

	const [language] = languages;

	// Collect hints for every canonical language the token belongs to; the Set
	// removes duplicates while keeping first-seen order.
	const hints = new Set(
		Object.entries(ANNA_LANGUAGE_ALIASES)
			.filter(([canonical, aliases]) => canonical === language || aliases.includes(language))
			.flatMap(([canonical]) => ANNA_LANGUAGE_QUERY_HINTS[canonical] ?? [])
	);

	const loweredQuery = trimmedQuery.toLowerCase();
	const hintedVariants = [...hints]
		.filter((hint) => !loweredQuery.includes(hint.toLowerCase()))
		.map((hint) => `${trimmedQuery} ${hint}`);

	return [trimmedQuery, ...hintedVariants];
}
216+
217+
/**
 * Builds the `/search` URL on ANNA_ARCHIVE_BASE_URL for a request.
 *
 * Parameters are set in this order: `q`, `content=book_any`, then `lang` and
 * `ext` when a single value could be resolved for each, then `page` for any
 * page after the first.
 *
 * @param input search request supplying the query and filters
 * @param page 1-based page number; values of 1 or less add no `page` parameter
 * @returns the absolute URL as a string
 */
function buildAnnaSearchUrl(input: SearchBooksRequest, page = 1): string {
	const parameters: Array<[string, string]> = [
		['q', input.query],
		['content', 'book_any']
	];

	const languageCode = annaLanguageFilterCode(input);
	if (languageCode) {
		parameters.push(['lang', languageCode]);
	}

	const extension = annaExtensionFilter(input);
	if (extension) {
		parameters.push(['ext', extension]);
	}

	if (page > 1) {
		parameters.push(['page', String(page)]);
	}

	const searchUrl = new URL('/search', ANNA_ARCHIVE_BASE_URL);
	for (const [key, value] of parameters) {
		searchUrl.searchParams.set(key, value);
	}

	return searchUrl.toString();
}
238+
239+
/**
 * Tells whether the request carries any filter that makes paging worthwhile.
 *
 * @param input search request
 * @returns `true` when a language or extension filter list is non-empty, or
 *          when `yearFrom` or `yearTo` is a number; `false` otherwise
 */
function shouldPaginateFilteredSearch(input: SearchBooksRequest): boolean {
	const filters = input.filters;
	if (!filters) {
		return false;
	}

	if ((filters.language?.length ?? 0) > 0) {
		return true;
	}

	if ((filters.extension?.length ?? 0) > 0) {
		return true;
	}

	if (typeof filters.yearFrom === 'number') {
		return true;
	}

	return typeof filters.yearTo === 'number';
}
247+
134248
function extractMetaInformation(meta: string): AnnaMetaInformation {
135249
const parts = meta
136250
.split(' · ')
@@ -205,7 +319,18 @@ function matchesLanguageFilter(language: string | null, tokens: Set<string>): bo
205319
}
206320

207321
const normalized = normalizeLanguageToken(language);
208-
return tokens.has(normalized);
322+
const candidates = new Set([normalized]);
323+
324+
for (const [canonicalLanguage, aliases] of Object.entries(ANNA_LANGUAGE_ALIASES)) {
325+
if (canonicalLanguage === normalized || aliases.includes(normalized)) {
326+
candidates.add(canonicalLanguage);
327+
for (const alias of aliases) {
328+
candidates.add(alias);
329+
}
330+
}
331+
}
332+
333+
return [...candidates].some((candidate) => tokens.has(candidate));
209334
}
210335

211336
function matchesExtensionFilter(format: string | null, input: SearchBooksRequest): boolean {
@@ -302,46 +427,93 @@ export class AnnaArchiveSearchProvider implements SearchProviderPort, SearchProv
302427
): Promise<ApiResult<SearchResultBook[]>> {
303428
const limit = Math.max(1, Math.min(input.filters?.limitPerProvider ?? 20, 50));
304429
const languageTokens = languageFilterTokens(input);
305-
const searchUrl = `${ANNA_ARCHIVE_BASE_URL}/search?q=${encodeURIComponent(input.query)}&content=book_any`;
430+
const maxPages = shouldPaginateFilteredSearch(input) ? ANNA_MAX_FILTERED_SEARCH_PAGES : 1;
431+
const queryVariants = annaQueryVariants(input);
432+
let firstPageError: ApiResult<SearchResultBook[]> | null = null;
306433

307434
try {
308-
const response = await fetch(searchUrl, {
309-
headers: {
310-
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
311-
'User-Agent': ANNA_ARCHIVE_BROWSER_USER_AGENT
435+
for (const query of queryVariants) {
436+
const books: SearchResultBook[] = [];
437+
const seenHashes = new Set<string>();
438+
439+
for (let page = 1; page <= maxPages && books.length < limit; page += 1) {
440+
const searchUrl = buildAnnaSearchUrl({ ...input, query }, page);
441+
442+
try {
443+
const response = await fetch(searchUrl, {
444+
headers: {
445+
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
446+
'User-Agent': ANNA_ARCHIVE_BROWSER_USER_AGENT
447+
}
448+
});
449+
450+
if (!response.ok) {
451+
const errorResult = apiError(
452+
`Anna search failed with status ${response.status}`,
453+
response.status
454+
);
455+
if (page === 1 && query === queryVariants[0]) {
456+
return errorResult;
457+
}
458+
continue;
459+
}
460+
461+
const html = await response.text();
462+
if (html.includes('DDoS-Guard')) {
463+
const errorResult = apiError('Anna search was blocked by browser verification', 502);
464+
if (page === 1 && query === queryVariants[0]) {
465+
return errorResult;
466+
}
467+
continue;
468+
}
469+
470+
const matches = [...html.matchAll(resultAnchorRegex)];
471+
if (matches.length === 0) {
472+
if (page === 1) {
473+
break;
474+
}
475+
continue;
476+
}
477+
478+
for (let index = 0; index < matches.length; index += 1) {
479+
if (books.length >= limit) {
480+
break;
481+
}
482+
483+
const match = matches[index];
484+
const nextMatch = matches[index + 1];
485+
const hash = match[1];
486+
if (seenHashes.has(hash)) {
487+
continue;
488+
}
489+
490+
const start = match.index ?? 0;
491+
const end = nextMatch?.index ?? html.length;
492+
const segment = html.slice(start, end);
493+
const book = mapBook(segment, hash, input, languageTokens);
494+
if (book) {
495+
seenHashes.add(hash);
496+
books.push(book);
497+
}
498+
}
499+
} catch (cause: unknown) {
500+
if (page === 1 && query === queryVariants[0]) {
501+
firstPageError = apiError('Anna search failed', 502, cause);
502+
break;
503+
}
504+
}
312505
}
313-
});
314-
315-
if (!response.ok) {
316-
return apiError(`Anna search failed with status ${response.status}`, response.status);
317-
}
318-
319-
const html = await response.text();
320-
if (html.includes('DDoS-Guard')) {
321-
return apiError('Anna search was blocked by browser verification', 502);
322-
}
323506

324-
const matches = [...html.matchAll(resultAnchorRegex)];
325-
const books: SearchResultBook[] = [];
326-
327-
for (let index = 0; index < matches.length; index += 1) {
328-
if (books.length >= limit) {
329-
break;
507+
if (books.length > 0) {
508+
return apiOk(books);
330509
}
510+
}
331511

332-
const match = matches[index];
333-
const nextMatch = matches[index + 1];
334-
const hash = match[1];
335-
const start = match.index ?? 0;
336-
const end = nextMatch?.index ?? html.length;
337-
const segment = html.slice(start, end);
338-
const book = mapBook(segment, hash, input, languageTokens);
339-
if (book) {
340-
books.push(book);
341-
}
512+
if (firstPageError) {
513+
return firstPageError;
342514
}
343515

344-
return apiOk(books);
516+
return apiOk([]);
345517
} catch (cause: unknown) {
346518
return apiError('Anna search failed', 502, cause);
347519
}

0 commit comments

Comments
 (0)