Skip to content

Commit c1f52d1

Browse files
authored
Merge pull request #675 from ROHANREDDYGADE/engines
🚸 Normalized urls for `yahoo` search engine to remove redirects
2 parents d577ede + b3aa33d commit c1f52d1

File tree

1 file changed

+83
-24
lines changed

1 file changed

+83
-24
lines changed

src/engines/yahoo.rs

+83-24
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,17 @@
11
//! The `yahoo` module handles the scraping of results from the yahoo search engine
22
//! by querying the upstream yahoo search engine with user provided query and with a page
33
4-
use std::collections::HashMap;
5-
6-
use reqwest::header::HeaderMap;
4+
use error_stack::{Report, Result as StackResult, ResultExt};
75

8-
// use reqwest::{Client, Error};
6+
use std::collections::HashMap;
97

108
use reqwest::Client;
11-
9+
use reqwest::header::HeaderMap;
1210
use scraper::Html;
1311

1412
use crate::models::aggregation::SearchResult;
15-
1613
use crate::models::engine::{EngineError, SearchEngine};
1714

18-
use error_stack::{Report, Result, ResultExt};
19-
2015
use super::search_result_parser::SearchResultParser;
2116

2217
/// A new Yahoo engine type defined in-order to implement the `SearchEngine` trait which allows to
@@ -30,18 +25,19 @@ pub struct Yahoo {
3025

3126
impl Yahoo {
3227
/// Creates the Yahoo parser.
33-
pub fn new() -> Result<Self, EngineError> {
28+
pub fn new() -> StackResult<Self, EngineError> {
3429
Ok(Self {
3530
parser: SearchResultParser::new(
3631
".compNoResult",
3732
"div.algo",
3833
"h3.title a",
3934
"h3 a",
4035
".compText",
41-
)?,
42-
// client: Client::new(),
36+
)
37+
.change_context(EngineError::UnexpectedError)?,
4338
})
4439
}
40+
4541
//TODO: Function not implemented yet
4642
//
4743
// Function to fetch the final destination URL after handling redirects
@@ -56,6 +52,71 @@ impl Yahoo {
5652
// Ok(final_url)
5753
// }
5854
}
55+
/// Parses the Yahoo redirect URL and extracts the actual target URL.
56+
fn parse_yahoo_redirect_url(raw_url: &str) -> String {
57+
// Look for the /RU= marker
58+
if let Some(start_idx) = raw_url.find("/RU=") {
59+
let encoded_start = &raw_url[start_idx + 4..]; // skip "/RU="
60+
let end_markers = ["/RS", "/RK"];
61+
let end_idx = end_markers
62+
.iter()
63+
.filter_map(|marker| encoded_start.find(marker))
64+
.min()
65+
.unwrap_or(encoded_start.len());
66+
67+
let encoded_url = &encoded_start[..end_idx];
68+
69+
// Manual URL decode using url::form_urlencoded
70+
match percent_decode(encoded_url.as_bytes()) {
71+
Ok(decoded) => decoded,
72+
Err(_) => raw_url.to_string(), // fallback
73+
}
74+
} else {
75+
raw_url.to_string()
76+
}
77+
}
78+
79+
/// Perform a percent-decoding using only the Rust standard library.
80+
// use error_stack::{Report, Result};
81+
/// Perform percent-decoding using only the Rust standard library
82+
fn percent_decode(input: &[u8]) -> Result<String, Report<FromUtf8Error>> {
83+
let mut output = Vec::with_capacity(input.len());
84+
let mut i = 0;
85+
86+
while i < input.len() {
87+
match input[i] {
88+
b'%' if i + 2 < input.len() => {
89+
if let (Some(h), Some(l)) = (from_hex(input[i + 1]), from_hex(input[i + 2])) {
90+
output.push(h * 16 + l);
91+
i += 3;
92+
} else {
93+
output.push(input[i]);
94+
i += 1;
95+
}
96+
}
97+
b => {
98+
output.push(b);
99+
i += 1;
100+
}
101+
}
102+
}
103+
104+
// Manually handle the error conversion to Report
105+
String::from_utf8(output).map_err(|e| Report::new(e))
106+
}
107+
108+
// `FromUtf8Error` is the error type wrapped in the `Report` returned by `percent_decode`.
109+
use std::string::FromUtf8Error;
110+
111+
/// Maps one ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`) to its numeric value.
///
/// Returns `None` for any byte that is not a hexadecimal digit.
fn from_hex(byte: u8) -> Option<u8> {
    // `to_digit(16)` yields values in 0..16, so narrowing to `u8` is lossless.
    char::from(byte).to_digit(16).map(|digit| digit as u8)
}
59120

60121
#[async_trait::async_trait]
61122
impl SearchEngine for Yahoo {
@@ -66,9 +127,7 @@ impl SearchEngine for Yahoo {
66127
user_agent: &str,
67128
client: &Client,
68129
_safe_search: u8,
69-
) -> Result<Vec<(String, SearchResult)>, EngineError> {
70-
// Page number can be missing or empty string and so appropriate handling is required
71-
// so that upstream server recieves valid page number.
130+
) -> StackResult<Vec<(String, SearchResult)>, EngineError> {
72131
let url: String = if page == 0 {
73132
format!("https://search.yahoo.com/search/?p={}", query)
74133
} else {
@@ -79,7 +138,6 @@ impl SearchEngine for Yahoo {
79138
)
80139
};
81140

82-
// initializing HeaderMap and adding appropriate headers.
83141
let header_map = HeaderMap::try_from(&HashMap::from([
84142
("User-Agent".to_string(), user_agent.to_string()),
85143
("Referer".to_string(), "https://google.com/".to_string()),
@@ -91,35 +149,36 @@ impl SearchEngine for Yahoo {
91149
]))
92150
.change_context(EngineError::UnexpectedError)?;
93151

94-
let document: Html = Html::parse_document(
95-
&Yahoo::fetch_html_from_upstream(self, &url, header_map, client).await?,
96-
);
152+
let html_str = Yahoo::fetch_html_from_upstream(self, &url, header_map, client)
153+
.await
154+
.change_context(EngineError::UnexpectedError)?;
155+
156+
let document: Html = Html::parse_document(&html_str);
97157

98158
if self.parser.parse_for_no_results(&document).next().is_some() {
99159
return Err(Report::new(EngineError::EmptyResultSet));
100160
}
101161

102162
self.parser
103163
.parse_for_results(&document, |title, url, desc| {
104-
// Scrape the HTML to extract and clean the data.
105164
let cleaned_title = title
106165
.attr("aria-label")
107166
.unwrap_or("No Title Found")
108167
.trim()
109168
.to_owned();
110-
let cleaned_url = url
111-
.value()
112-
.attr("href")
113-
.unwrap_or("No Link Found")
114-
.to_owned();
169+
170+
let raw_url = url.value().attr("href").unwrap_or("No Link Found");
171+
let cleaned_url = parse_yahoo_redirect_url(raw_url);
115172

116173
let cleaned_description = desc.inner_html().trim().to_owned();
174+
117175
Some(SearchResult::new(
118176
&cleaned_title,
119177
&cleaned_url,
120178
&cleaned_description,
121179
&["yahoo"],
122180
))
123181
})
182+
.change_context(EngineError::UnexpectedError)
124183
}
125184
}

0 commit comments

Comments
 (0)