1
1
//! The `yahoo` module handles the scraping of results from the yahoo search engine
2
2
//! by querying the upstream yahoo search engine with user provided query and with a page
3
3
4
- use std:: collections:: HashMap ;
5
-
6
- use reqwest:: header:: HeaderMap ;
4
+ use error_stack:: { Report , Result as StackResult , ResultExt } ;
7
5
8
- // use reqwest::{Client, Error} ;
6
+ use std :: collections :: HashMap ;
9
7
10
8
use reqwest:: Client ;
11
-
9
+ use reqwest :: header :: HeaderMap ;
12
10
use scraper:: Html ;
13
11
14
12
use crate :: models:: aggregation:: SearchResult ;
15
-
16
13
use crate :: models:: engine:: { EngineError , SearchEngine } ;
17
14
18
- use error_stack:: { Report , Result , ResultExt } ;
19
-
20
15
use super :: search_result_parser:: SearchResultParser ;
21
16
22
17
/// A new Yahoo engine type defined in-order to implement the `SearchEngine` trait which allows to
@@ -30,18 +25,19 @@ pub struct Yahoo {
30
25
31
26
impl Yahoo {
32
27
/// Creates the Yahoo parser.
33
- pub fn new ( ) -> Result < Self , EngineError > {
28
+ pub fn new ( ) -> StackResult < Self , EngineError > {
34
29
Ok ( Self {
35
30
parser : SearchResultParser :: new (
36
31
".compNoResult" ,
37
32
"div.algo" ,
38
33
"h3.title a" ,
39
34
"h3 a" ,
40
35
".compText" ,
41
- ) ? ,
42
- // client: Client::new() ,
36
+ )
37
+ . change_context ( EngineError :: UnexpectedError ) ? ,
43
38
} )
44
39
}
40
+
45
41
//TODO: Function not implemented yet
46
42
//
47
43
// Function to fetch the final destination URL after handling redirects
@@ -56,6 +52,71 @@ impl Yahoo {
56
52
// Ok(final_url)
57
53
// }
58
54
}
55
+ /// Parses the Yahoo redirect URL and extracts the actual target URL.
56
+ fn parse_yahoo_redirect_url ( raw_url : & str ) -> String {
57
+ // Look for the /RU= marker
58
+ if let Some ( start_idx) = raw_url. find ( "/RU=" ) {
59
+ let encoded_start = & raw_url[ start_idx + 4 ..] ; // skip "/RU="
60
+ let end_markers = [ "/RS" , "/RK" ] ;
61
+ let end_idx = end_markers
62
+ . iter ( )
63
+ . filter_map ( |marker| encoded_start. find ( marker) )
64
+ . min ( )
65
+ . unwrap_or ( encoded_start. len ( ) ) ;
66
+
67
+ let encoded_url = & encoded_start[ ..end_idx] ;
68
+
69
+ // Manual URL decode using url::form_urlencoded
70
+ match percent_decode ( encoded_url. as_bytes ( ) ) {
71
+ Ok ( decoded) => decoded,
72
+ Err ( _) => raw_url. to_string ( ) , // fallback
73
+ }
74
+ } else {
75
+ raw_url. to_string ( )
76
+ }
77
+ }
78
+
79
+ /// Perform a percent-decoding using only the Rust standard library.
80
+ // use error_stack::{Report, Result};
81
+ /// Perform percent-decoding using only the Rust standard library
82
+ fn percent_decode ( input : & [ u8 ] ) -> Result < String , Report < FromUtf8Error > > {
83
+ let mut output = Vec :: with_capacity ( input. len ( ) ) ;
84
+ let mut i = 0 ;
85
+
86
+ while i < input. len ( ) {
87
+ match input[ i] {
88
+ b'%' if i + 2 < input. len ( ) => {
89
+ if let ( Some ( h) , Some ( l) ) = ( from_hex ( input[ i + 1 ] ) , from_hex ( input[ i + 2 ] ) ) {
90
+ output. push ( h * 16 + l) ;
91
+ i += 3 ;
92
+ } else {
93
+ output. push ( input[ i] ) ;
94
+ i += 1 ;
95
+ }
96
+ }
97
+ b => {
98
+ output. push ( b) ;
99
+ i += 1 ;
100
+ }
101
+ }
102
+ }
103
+
104
+ // Manually handle the error conversion to Report
105
+ String :: from_utf8 ( output) . map_err ( |e| Report :: new ( e) )
106
+ }
107
+
108
+ // Need to add this import
109
+ use std:: string:: FromUtf8Error ;
110
+
111
/// Convert a single ASCII hex character to its value.
///
/// Accepts `0`-`9`, `a`-`f` and `A`-`F` and returns the digit's numeric value
/// (`0..=15`). Any other byte yields `None`.
fn from_hex(byte: u8) -> Option<u8> {
    match byte {
        b'0'..=b'9' => Some(byte - b'0'),
        b'a'..=b'f' => Some(byte - b'a' + 10),
        b'A'..=b'F' => Some(byte - b'A' + 10),
        _ => None,
    }
}
59
120
60
121
#[ async_trait:: async_trait]
61
122
impl SearchEngine for Yahoo {
@@ -66,9 +127,7 @@ impl SearchEngine for Yahoo {
66
127
user_agent : & str ,
67
128
client : & Client ,
68
129
_safe_search : u8 ,
69
- ) -> Result < Vec < ( String , SearchResult ) > , EngineError > {
70
- // Page number can be missing or empty string and so appropriate handling is required
71
- // so that upstream server recieves valid page number.
130
+ ) -> StackResult < Vec < ( String , SearchResult ) > , EngineError > {
72
131
let url: String = if page == 0 {
73
132
format ! ( "https://search.yahoo.com/search/?p={}" , query)
74
133
} else {
@@ -79,7 +138,6 @@ impl SearchEngine for Yahoo {
79
138
)
80
139
} ;
81
140
82
- // initializing HeaderMap and adding appropriate headers.
83
141
let header_map = HeaderMap :: try_from ( & HashMap :: from ( [
84
142
( "User-Agent" . to_string ( ) , user_agent. to_string ( ) ) ,
85
143
( "Referer" . to_string ( ) , "https://google.com/" . to_string ( ) ) ,
@@ -91,35 +149,36 @@ impl SearchEngine for Yahoo {
91
149
] ) )
92
150
. change_context ( EngineError :: UnexpectedError ) ?;
93
151
94
- let document: Html = Html :: parse_document (
95
- & Yahoo :: fetch_html_from_upstream ( self , & url, header_map, client) . await ?,
96
- ) ;
152
+ let html_str = Yahoo :: fetch_html_from_upstream ( self , & url, header_map, client)
153
+ . await
154
+ . change_context ( EngineError :: UnexpectedError ) ?;
155
+
156
+ let document: Html = Html :: parse_document ( & html_str) ;
97
157
98
158
if self . parser . parse_for_no_results ( & document) . next ( ) . is_some ( ) {
99
159
return Err ( Report :: new ( EngineError :: EmptyResultSet ) ) ;
100
160
}
101
161
102
162
self . parser
103
163
. parse_for_results ( & document, |title, url, desc| {
104
- // Scrape the HTML to extract and clean the data.
105
164
let cleaned_title = title
106
165
. attr ( "aria-label" )
107
166
. unwrap_or ( "No Title Found" )
108
167
. trim ( )
109
168
. to_owned ( ) ;
110
- let cleaned_url = url
111
- . value ( )
112
- . attr ( "href" )
113
- . unwrap_or ( "No Link Found" )
114
- . to_owned ( ) ;
169
+
170
+ let raw_url = url. value ( ) . attr ( "href" ) . unwrap_or ( "No Link Found" ) ;
171
+ let cleaned_url = parse_yahoo_redirect_url ( raw_url) ;
115
172
116
173
let cleaned_description = desc. inner_html ( ) . trim ( ) . to_owned ( ) ;
174
+
117
175
Some ( SearchResult :: new (
118
176
& cleaned_title,
119
177
& cleaned_url,
120
178
& cleaned_description,
121
179
& [ "yahoo" ] ,
122
180
) )
123
181
} )
182
+ . change_context ( EngineError :: UnexpectedError )
124
183
}
125
184
}
0 commit comments