@@ -20,6 +20,7 @@ const ANNA_ARCHIVE_BASE_URL = 'https://annas-archive.gl';
2020const ANNA_ARCHIVE_BROWSER_USER_AGENT =
2121 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' ;
2222const ANNA_LIBGEN_ADS_BASE_URL = 'https://libgen.li/ads.php' ;
/**
 * Upper bound on how many result pages are requested per query variant when
 * the search request carries filters; unfiltered searches fetch a single page.
 */
const ANNA_MAX_FILTERED_SEARCH_PAGES = 5;
2324
2425const annaLibgenGetLinkRegex = / h r e f = " ( g e t \. p h p \? m d 5 = [ ^ " ] + ) " / i;
2526
@@ -45,6 +46,31 @@ interface AnnaMetaInformation {
4546 sourceFamily : string | null ;
4647}
4748
/**
 * Maps a normalized language filter token (full name or two-letter code) to
 * the code that is sent as the `lang` search parameter.
 */
const ANNA_LANGUAGE_FILTER_CODES: Record<string, string> = {
  english: 'en',
  german: 'de',
  french: 'fr',
  spanish: 'es',
  en: 'en',
  de: 'de',
  fr: 'fr',
  es: 'es',
};

/**
 * Every spelling treated as equivalent for a canonical language name.
 * Keyed by the canonical (lowercase, full) language name.
 */
const ANNA_LANGUAGE_ALIASES: Record<string, string[]> = {
  english: ['english', 'en', 'eng'],
  german: ['german', 'de', 'deu', 'ger'],
  french: ['french', 'fr', 'fra', 'fre'],
  spanish: ['spanish', 'es', 'spa'],
};

/**
 * Extra words appended to the query text, one variant per word, when a single
 * language is requested. Keyed by the canonical language name.
 */
const ANNA_LANGUAGE_QUERY_HINTS: Record<string, string[]> = {
  english: ['english'],
  german: ['deutsch'],
  french: ['francais', 'french'],
  spanish: ['espanol', 'spanish'],
};
73+
4874function isValidCodePoint ( codePoint : number ) : boolean {
4975 return (
5076 Number . isFinite ( codePoint ) &&
@@ -123,6 +149,10 @@ function normalizeLanguageToken(value: string): string {
123149 return value . trim ( ) . toLowerCase ( ) ;
124150}
125151
/**
 * Canonicalizes a file-extension filter value for comparison.
 *
 * @param value - Raw extension token as supplied in the request filters.
 * @returns The token with surrounding whitespace removed, in lowercase.
 */
function normalizeExtensionToken(value: string): string {
  const trimmed = value.trim();
  return trimmed.toLowerCase();
}
155+
126156function languageFilterTokens ( input : SearchBooksRequest ) : Set < string > {
127157 return new Set (
128158 ( input . filters ?. language ?? [ ] )
@@ -131,6 +161,90 @@ function languageFilterTokens(input: SearchBooksRequest): Set<string> {
131161 ) ;
132162}
133163
/**
 * Resolves the requested language filter to the code sent as the `lang`
 * search parameter.
 *
 * A code is only produced when exactly one distinct language token is
 * requested. The token is first looked up directly in
 * `ANNA_LANGUAGE_FILTER_CODES`; if that misses, it is resolved through
 * `ANNA_LANGUAGE_ALIASES` (so `eng`, `deu`, `ger`, ... map to the code of
 * their canonical language), matching how query hints and result filtering
 * already treat aliases.
 *
 * @param input - Search request whose language filter tokens are inspected.
 * @returns The language code, or `null` when zero or several languages are
 *          requested or the token is not a known language.
 */
function annaLanguageFilterCode(input: SearchBooksRequest): string | null {
  const [requestedLanguage, ...otherLanguages] = [...languageFilterTokens(input)];
  if (requestedLanguage === undefined || otherLanguages.length > 0) {
    return null;
  }

  // Own-property lookup only: a plain index access would return inherited
  // members for tokens such as "constructor" or "__proto__", which are not
  // strings and must never reach the search URL.
  const ownCode = (key: string): string | null =>
    Object.prototype.hasOwnProperty.call(ANNA_LANGUAGE_FILTER_CODES, key)
      ? (ANNA_LANGUAGE_FILTER_CODES[key] ?? null)
      : null;

  const directCode = ownCode(requestedLanguage);
  if (directCode !== null) {
    return directCode;
  }

  // Fall back to alias resolution, e.g. "deu" -> "german" -> "de".
  for (const [canonicalLanguage, aliases] of Object.entries(ANNA_LANGUAGE_ALIASES)) {
    if (!aliases.includes(requestedLanguage)) {
      continue;
    }

    const aliasCode = ownCode(canonicalLanguage);
    if (aliasCode !== null) {
      return aliasCode;
    }
  }

  return null;
}
172+
/**
 * Picks the file extension to send as a search parameter.
 *
 * @param input - Search request whose `filters.extension` values are inspected.
 * @returns The normalized extension when exactly one distinct, non-empty
 *          extension is requested; otherwise `null`.
 */
function annaExtensionFilter(input: SearchBooksRequest): string | null {
  const distinctExtensions = new Set<string>();
  for (const rawExtension of input.filters?.extension ?? []) {
    const token = normalizeExtensionToken(rawExtension);
    if (token.length > 0) {
      distinctExtensions.add(token);
    }
  }

  if (distinctExtensions.size !== 1) {
    return null;
  }

  const [onlyExtension] = [...distinctExtensions];
  return onlyExtension;
}
182+
/**
 * Produces the list of query strings to try, in order.
 *
 * The trimmed query always comes first. When exactly one language is
 * requested, one extra variant is appended per language hint word that the
 * query does not already contain (compared case-insensitively).
 *
 * @param input - Search request providing the query text and language filter.
 * @returns An empty array for a blank query, otherwise the base query followed
 *          by any hint-augmented variants.
 */
function annaQueryVariants(input: SearchBooksRequest): string[] {
  const trimmedQuery = input.query.trim();
  if (!trimmedQuery) {
    return [];
  }

  const languages = [...languageFilterTokens(input)];
  if (languages.length !== 1) {
    return [trimmedQuery];
  }

  // Collect the hint words of every canonical language the token refers to.
  const hints = new Set<string>();
  for (const language of languages) {
    Object.entries(ANNA_LANGUAGE_ALIASES)
      .filter(([canonical, aliases]) => canonical === language || aliases.includes(language))
      .forEach(([canonical]) => {
        (ANNA_LANGUAGE_QUERY_HINTS[canonical] ?? []).forEach((hint) => hints.add(hint));
      });
  }

  // Skip hints the user already typed so the query is not duplicated.
  const hintedVariants = [...hints]
    .filter((hint) => !trimmedQuery.toLowerCase().includes(hint.toLowerCase()))
    .map((hint) => `${trimmedQuery} ${hint}`);

  return [trimmedQuery, ...hintedVariants];
}
216+
/**
 * Builds the absolute search URL for one page of results.
 *
 * `q` and `content=book_any` are always set. `lang` and `ext` are added only
 * when the request resolves to a single language code / extension, and `page`
 * is added only for pages after the first.
 *
 * @param input - Search request supplying the query text and filters.
 * @param page - 1-based page number; defaults to the first page.
 * @returns The serialized URL string.
 */
function buildAnnaSearchUrl(input: SearchBooksRequest, page = 1): string {
  const searchUrl = new URL('/search', ANNA_ARCHIVE_BASE_URL);
  const { searchParams } = searchUrl;

  searchParams.set('q', input.query);
  searchParams.set('content', 'book_any');

  // Optional parameters, in the order they appear in the final URL.
  const optionalParams: Array<[string, string | null]> = [
    ['lang', annaLanguageFilterCode(input)],
    ['ext', annaExtensionFilter(input)],
    ['page', page > 1 ? String(page) : null],
  ];

  for (const [name, value] of optionalParams) {
    if (value) {
      searchParams.set(name, value);
    }
  }

  return searchUrl.toString();
}
238+
/**
 * Tells whether the request carries any filter that warrants fetching more
 * than one result page.
 *
 * @param input - Search request whose filters are inspected.
 * @returns `true` when at least one language or extension is listed, or when
 *          `yearFrom` / `yearTo` is a number; `false` otherwise.
 */
function shouldPaginateFilteredSearch(input: SearchBooksRequest): boolean {
  const filters = input.filters;
  if (filters == null) {
    return false;
  }

  const hasLanguage = (filters.language?.length ?? 0) > 0;
  const hasExtension = (filters.extension?.length ?? 0) > 0;
  const hasYearFrom = typeof filters.yearFrom === 'number';
  const hasYearTo = typeof filters.yearTo === 'number';

  return hasLanguage || hasExtension || hasYearFrom || hasYearTo;
}
247+
134248function extractMetaInformation ( meta : string ) : AnnaMetaInformation {
135249 const parts = meta
136250 . split ( ' · ' )
@@ -205,7 +319,18 @@ function matchesLanguageFilter(language: string | null, tokens: Set<string>): bo
205319 }
206320
207321 const normalized = normalizeLanguageToken ( language ) ;
208- return tokens . has ( normalized ) ;
322+ const candidates = new Set ( [ normalized ] ) ;
323+
324+ for ( const [ canonicalLanguage , aliases ] of Object . entries ( ANNA_LANGUAGE_ALIASES ) ) {
325+ if ( canonicalLanguage === normalized || aliases . includes ( normalized ) ) {
326+ candidates . add ( canonicalLanguage ) ;
327+ for ( const alias of aliases ) {
328+ candidates . add ( alias ) ;
329+ }
330+ }
331+ }
332+
333+ return [ ...candidates ] . some ( ( candidate ) => tokens . has ( candidate ) ) ;
209334}
210335
211336function matchesExtensionFilter ( format : string | null , input : SearchBooksRequest ) : boolean {
@@ -302,46 +427,93 @@ export class AnnaArchiveSearchProvider implements SearchProviderPort, SearchProv
302427 ) : Promise < ApiResult < SearchResultBook [ ] > > {
303428 const limit = Math . max ( 1 , Math . min ( input . filters ?. limitPerProvider ?? 20 , 50 ) ) ;
304429 const languageTokens = languageFilterTokens ( input ) ;
305- const searchUrl = `${ ANNA_ARCHIVE_BASE_URL } /search?q=${ encodeURIComponent ( input . query ) } &content=book_any` ;
430+ const maxPages = shouldPaginateFilteredSearch ( input ) ? ANNA_MAX_FILTERED_SEARCH_PAGES : 1 ;
431+ const queryVariants = annaQueryVariants ( input ) ;
432+ let firstPageError : ApiResult < SearchResultBook [ ] > | null = null ;
306433
307434 try {
308- const response = await fetch ( searchUrl , {
309- headers : {
310- Accept : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' ,
311- 'User-Agent' : ANNA_ARCHIVE_BROWSER_USER_AGENT
435+ for ( const query of queryVariants ) {
436+ const books : SearchResultBook [ ] = [ ] ;
437+ const seenHashes = new Set < string > ( ) ;
438+
439+ for ( let page = 1 ; page <= maxPages && books . length < limit ; page += 1 ) {
440+ const searchUrl = buildAnnaSearchUrl ( { ...input , query } , page ) ;
441+
442+ try {
443+ const response = await fetch ( searchUrl , {
444+ headers : {
445+ Accept : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' ,
446+ 'User-Agent' : ANNA_ARCHIVE_BROWSER_USER_AGENT
447+ }
448+ } ) ;
449+
450+ if ( ! response . ok ) {
451+ const errorResult = apiError (
452+ `Anna search failed with status ${ response . status } ` ,
453+ response . status
454+ ) ;
455+ if ( page === 1 && query === queryVariants [ 0 ] ) {
456+ return errorResult ;
457+ }
458+ continue ;
459+ }
460+
461+ const html = await response . text ( ) ;
462+ if ( html . includes ( 'DDoS-Guard' ) ) {
463+ const errorResult = apiError ( 'Anna search was blocked by browser verification' , 502 ) ;
464+ if ( page === 1 && query === queryVariants [ 0 ] ) {
465+ return errorResult ;
466+ }
467+ continue ;
468+ }
469+
470+ const matches = [ ...html . matchAll ( resultAnchorRegex ) ] ;
471+ if ( matches . length === 0 ) {
472+ if ( page === 1 ) {
473+ break ;
474+ }
475+ continue ;
476+ }
477+
478+ for ( let index = 0 ; index < matches . length ; index += 1 ) {
479+ if ( books . length >= limit ) {
480+ break ;
481+ }
482+
483+ const match = matches [ index ] ;
484+ const nextMatch = matches [ index + 1 ] ;
485+ const hash = match [ 1 ] ;
486+ if ( seenHashes . has ( hash ) ) {
487+ continue ;
488+ }
489+
490+ const start = match . index ?? 0 ;
491+ const end = nextMatch ?. index ?? html . length ;
492+ const segment = html . slice ( start , end ) ;
493+ const book = mapBook ( segment , hash , input , languageTokens ) ;
494+ if ( book ) {
495+ seenHashes . add ( hash ) ;
496+ books . push ( book ) ;
497+ }
498+ }
499+ } catch ( cause : unknown ) {
500+ if ( page === 1 && query === queryVariants [ 0 ] ) {
501+ firstPageError = apiError ( 'Anna search failed' , 502 , cause ) ;
502+ break ;
503+ }
504+ }
312505 }
313- } ) ;
314-
315- if ( ! response . ok ) {
316- return apiError ( `Anna search failed with status ${ response . status } ` , response . status ) ;
317- }
318-
319- const html = await response . text ( ) ;
320- if ( html . includes ( 'DDoS-Guard' ) ) {
321- return apiError ( 'Anna search was blocked by browser verification' , 502 ) ;
322- }
323506
324- const matches = [ ...html . matchAll ( resultAnchorRegex ) ] ;
325- const books : SearchResultBook [ ] = [ ] ;
326-
327- for ( let index = 0 ; index < matches . length ; index += 1 ) {
328- if ( books . length >= limit ) {
329- break ;
507+ if ( books . length > 0 ) {
508+ return apiOk ( books ) ;
330509 }
510+ }
331511
332- const match = matches [ index ] ;
333- const nextMatch = matches [ index + 1 ] ;
334- const hash = match [ 1 ] ;
335- const start = match . index ?? 0 ;
336- const end = nextMatch ?. index ?? html . length ;
337- const segment = html . slice ( start , end ) ;
338- const book = mapBook ( segment , hash , input , languageTokens ) ;
339- if ( book ) {
340- books . push ( book ) ;
341- }
512+ if ( firstPageError ) {
513+ return firstPageError ;
342514 }
343515
344- return apiOk ( books ) ;
516+ return apiOk ( [ ] ) ;
345517 } catch ( cause : unknown ) {
346518 return apiError ( 'Anna search failed' , 502 , cause ) ;
347519 }
0 commit comments