From 1d0951bb680f2b0903fc39199b4fdbc3c104480f Mon Sep 17 00:00:00 2001 From: ROHANREDDYGADE Date: Wed, 23 Apr 2025 12:22:58 +0530 Subject: [PATCH 1/8] Normalized Urls for Yahoo Search Engine No more redirect Urls --- Cargo.lock | 7 +++++++ Cargo.toml | 1 + src/engines/yahoo.rs | 28 +++++++++++++++++++++++++--- 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cc99873d..7c5cc601 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4804,6 +4804,12 @@ dependencies = [ "percent-encoding 2.3.1", ] +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + [[package]] name = "utf-8" version = "0.7.6" @@ -5058,6 +5064,7 @@ dependencies = [ "tempfile", "thesaurus", "tokio 1.44.2", + "urlencoding", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index f3e39745..34b2f3b7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,7 @@ reqwest = { version = "0.12.5", default-features = false, features = [ "http2", "socks", ] } +urlencoding = "2.1.3" tokio = { version = "1.43.1", features = [ "rt-multi-thread", "macros", diff --git a/src/engines/yahoo.rs b/src/engines/yahoo.rs index e945e009..ea5d3952 100644 --- a/src/engines/yahoo.rs +++ b/src/engines/yahoo.rs @@ -56,6 +56,27 @@ impl Yahoo { // Ok(final_url) // } } +fn parse_yahoo_redirect_url(raw_url: &str) -> String { + // Look for the /RU= marker + if let Some(start_idx) = raw_url.find("/RU=") { + let encoded_start = &raw_url[start_idx + 4..]; // skip "/RU=" + let end_markers = ["/RS", "/RK"]; + let end_idx = end_markers + .iter() + .filter_map(|marker| encoded_start.find(marker)) + .min() + .unwrap_or(encoded_start.len()); + + let encoded_url = &encoded_start[..end_idx]; + + match urlencoding::decode(encoded_url) { + Ok(decoded) => decoded.into_owned(), + Err(_) => raw_url.to_string(), // fallback + } + } else { + raw_url.to_string() + } +} #[async_trait::async_trait] impl SearchEngine for Yahoo { @@ -107,11 +128,12 @@ impl SearchEngine for Yahoo { .unwrap_or("No Title Found") .trim() .to_owned(); - let cleaned_url = url + let raw_url = url .value() .attr("href") - .unwrap_or("No Link Found") - .to_owned(); + .unwrap_or("No Link Found"); + + let cleaned_url = parse_yahoo_redirect_url(raw_url); let cleaned_description = desc.inner_html().trim().to_owned(); Some(SearchResult::new( From d2a0ab456d38e36db212aa9ae2bfa11e7402b988 Mon Sep 17 00:00:00 2001 From: ROHANREDDYGADE Date: Mon, 28 Apr 2025 17:17:12 +0530 Subject: [PATCH 2/8] removed decoder dependency --- Cargo.lock | 7 ------- Cargo.toml | 1 - src/engines/yahoo.rs | 16 ++++++++++++++-- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index aa152050..422daf8f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4795,12 +4795,6 @@ dependencies = [ "percent-encoding 2.3.1", ] -[[package]] -name = "urlencoding" -version = "2.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" - [[package]] name = "utf-8" version = "0.7.6" @@ -5056,7 +5050,6 @@ dependencies = [ "tempfile", "thesaurus", "tokio 1.44.2", - "urlencoding", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 90c1a420..f9f07560 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,6 @@ reqwest = { version = "0.12.5", default-features = false, features = [ "http2", "socks", ] } -urlencoding = "2.1.3" tokio = { version = "1.43.1", features = [ "rt-multi-thread", "macros", diff --git a/src/engines/yahoo.rs b/src/engines/yahoo.rs index e8f970a2..8abcdfce 100644 --- a/src/engines/yahoo.rs +++ b/src/engines/yahoo.rs @@ -69,8 +69,9 @@ fn parse_yahoo_redirect_url(raw_url: &str) -> String { let encoded_url = &encoded_start[..end_idx]; - match urlencoding::decode(encoded_url) { - Ok(decoded) => decoded.into_owned(), + // Manual URL decode using url::form_urlencoded + match percent_decode(encoded_url.as_bytes()) { + Ok(decoded) => decoded, Err(_) => raw_url.to_string(), // fallback } } else { @@ -78,6 +79,17 @@ fn parse_yahoo_redirect_url(raw_url: &str) -> String { } } +/// Perform a percent-decoding manually using basic Rust stdlib +fn percent_decode(input: &[u8]) -> Result { + use std::borrow::Cow; + let decoded = url::percent_encoding::percent_decode(input).decode_utf8_lossy(); + match decoded { + Cow::Borrowed(s) => Ok(s.to_string()), + Cow::Owned(s) => Ok(s), + } +} + + #[async_trait::async_trait] impl SearchEngine for Yahoo { async fn results( From c76d7fc57ddb570d0e4eaa7463011c3e8330d005 Mon Sep 17 00:00:00 2001 From: ROHANREDDYGADE Date: Tue, 29 Apr 2025 12:04:13 +0530 Subject: [PATCH 3/8] decoding of url is being done manually --- src/engines/yahoo.rs | 42 +++++++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/src/engines/yahoo.rs b/src/engines/yahoo.rs index 8abcdfce..8874a06e 100644 --- a/src/engines/yahoo.rs +++ b/src/engines/yahoo.rs @@ -78,18 +78,46 @@ fn parse_yahoo_redirect_url(raw_url: &str) -> String { raw_url.to_string() } } - -/// Perform a percent-decoding manually using basic Rust stdlib +/// Perform a percent-decoding using only the Rust standard library. fn percent_decode(input: &[u8]) -> Result { - use std::borrow::Cow; - let decoded = url::percent_encoding::percent_decode(input).decode_utf8_lossy(); - match decoded { - Cow::Borrowed(s) => Ok(s.to_string()), - Cow::Owned(s) => Ok(s), + let mut output = Vec::with_capacity(input.len()); + let mut i = 0; + + while i < input.len() { + match input[i] { + b'%' if i + 2 < input.len() => { + if let (Some(h), Some(l)) = (from_hex(input[i + 1]), from_hex(input[i + 2])) { + output.push(h * 16 + l); + i += 3; + } else { + // Invalid percent-encoding, keep literal + output.push(input[i]); + i += 1; + } + } + b => { + output.push(b); + i += 1; + } + } + } + + String::from_utf8(output) +} + +/// Convert a single ASCII hex character to its value. +fn from_hex(byte: u8) -> Option { + match byte { + b'0'..=b'9' => Some(byte - b'0'), + b'a'..=b'f' => Some(byte - b'a' + 10), + b'A'..=b'F' => Some(byte - b'A' + 10), + _ => None, } } + + #[async_trait::async_trait] impl SearchEngine for Yahoo { async fn results( From ff01b0fc5d494186c4e01bdca4c03a6ac42d2699 Mon Sep 17 00:00:00 2001 From: ROHANREDDYGADE Date: Tue, 29 Apr 2025 17:16:22 +0530 Subject: [PATCH 4/8] return type error --- src/engines/yahoo.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/engines/yahoo.rs b/src/engines/yahoo.rs index 8874a06e..b4aea8d0 100644 --- a/src/engines/yahoo.rs +++ b/src/engines/yahoo.rs @@ -79,7 +79,8 @@ fn parse_yahoo_redirect_url(raw_url: &str) -> String { } } /// Perform a percent-decoding using only the Rust standard library. -fn percent_decode(input: &[u8]) -> Result { + +fn percent_decode(input: &[u8]) -> Result> { let mut output = Vec::with_capacity(input.len()); let mut i = 0; @@ -102,9 +103,11 @@ fn percent_decode(input: &[u8]) -> Result { } } - String::from_utf8(output) + // Wrap the FromUtf8Error into a Report if there is an error + String::from_utf8(output).map_err(|e| Report::new(e)) } + /// Convert a single ASCII hex character to its value. fn from_hex(byte: u8) -> Option { match byte { From ea1caa7564b4f59b6bf690b07d178fdd73b765d5 Mon Sep 17 00:00:00 2001 From: ROHANREDDYGADE Date: Tue, 29 Apr 2025 18:11:15 +0530 Subject: [PATCH 5/8] yahoo urls normalized --- src/engines/yahoo.rs | 57 +++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/src/engines/yahoo.rs b/src/engines/yahoo.rs index b4aea8d0..789307e3 100644 --- a/src/engines/yahoo.rs +++ b/src/engines/yahoo.rs @@ -1,24 +1,21 @@ //! The `yahoo` module handles the scraping of results from the yahoo search engine //! by querying the upstream yahoo search engine with user provided query and with a page +use error_stack::{Report, Result as StackResult, ResultExt}; + use std::collections::HashMap; use reqwest::header::HeaderMap; - -// use reqwest::{Client, Error}; - use reqwest::Client; - use scraper::Html; use crate::models::aggregation::SearchResult; - use crate::models::engine::{EngineError, SearchEngine}; -use error_stack::{Report, Result, ResultExt}; - use super::search_result_parser::SearchResultParser; +// Removed unused import: std::string::FromUtf8Error + /// A new Yahoo engine type defined in-order to implement the `SearchEngine` trait which allows to /// reduce code duplication as well as allows to create vector of different search engines easily. pub struct Yahoo { @@ -30,7 +27,7 @@ pub struct Yahoo { impl Yahoo { /// Creates the Yahoo parser. - pub fn new() -> Result { + pub fn new() -> StackResult { Ok(Self { parser: SearchResultParser::new( ".compNoResult", @@ -38,10 +35,11 @@ impl Yahoo { "h3.title a", "h3 a", ".compText", - )?, - // client: Client::new(), + ) + .change_context(EngineError::UnexpectedError)?, }) } + //TODO: Function not implemented yet // // Function to fetch the final destination URL after handling redirects @@ -56,6 +54,7 @@ impl Yahoo { // Ok(final_url) // } } + fn parse_yahoo_redirect_url(raw_url: &str) -> String { // Look for the /RU= marker if let Some(start_idx) = raw_url.find("/RU=") { @@ -78,9 +77,12 @@ fn parse_yahoo_redirect_url(raw_url: &str) -> String { raw_url.to_string() } } + /// Perform a percent-decoding using only the Rust standard library. +// use error_stack::{Report, Result}; -fn percent_decode(input: &[u8]) -> Result> { +/// Perform percent-decoding using only the Rust standard library +fn percent_decode(input: &[u8]) -> Result> { let mut output = Vec::with_capacity(input.len()); let mut i = 0; @@ -91,7 +93,6 @@ fn percent_decode(input: &[u8]) -> Result Result Option { @@ -118,9 +121,6 @@ fn from_hex(byte: u8) -> Option { } } - - - #[async_trait::async_trait] impl SearchEngine for Yahoo { async fn results( @@ -130,9 +130,7 @@ impl SearchEngine for Yahoo { user_agent: &str, client: &Client, _safe_search: u8, - ) -> Result, EngineError> { - // Page number can be missing or empty string and so appropriate handling is required - // so that upstream server recieves valid page number. + ) -> StackResult, EngineError> { let url: String = if page == 0 { format!("https://search.yahoo.com/search/?p={}", query) } else { @@ -143,7 +141,6 @@ impl SearchEngine for Yahoo { ) }; - // initializing HeaderMap and adding appropriate headers. let header_map = HeaderMap::try_from(&HashMap::from([ ("User-Agent".to_string(), user_agent.to_string()), ("Referer".to_string(), "https://google.com/".to_string()), @@ -155,9 +152,11 @@ impl SearchEngine for Yahoo { ])) .change_context(EngineError::UnexpectedError)?; - let document: Html = Html::parse_document( - &Yahoo::fetch_html_from_upstream(self, &url, header_map, client).await?, - ); + let html_str = Yahoo::fetch_html_from_upstream(self, &url, header_map, client) + .await + .change_context(EngineError::UnexpectedError)?; + + let document: Html = Html::parse_document(&html_str); if self.parser.parse_for_no_results(&document).next().is_some() { return Err(Report::new(EngineError::EmptyResultSet)); @@ -165,20 +164,17 @@ impl SearchEngine for Yahoo { self.parser .parse_for_results(&document, |title, url, desc| { - // Scrape the HTML to extract and clean the data. let cleaned_title = title .attr("aria-label") .unwrap_or("No Title Found") .trim() .to_owned(); - let raw_url = url - .value() - .attr("href") - .unwrap_or("No Link Found"); - + + let raw_url = url.value().attr("href").unwrap_or("No Link Found"); let cleaned_url = parse_yahoo_redirect_url(raw_url); let cleaned_description = desc.inner_html().trim().to_owned(); + Some(SearchResult::new( &cleaned_title, &cleaned_url, @@ -186,5 +182,6 @@ impl SearchEngine for Yahoo { &["yahoo"], )) }) + .change_context(EngineError::UnexpectedError) } -} +} \ No newline at end of file From ddb1ae5ac86efe45faa120667771b2f5991d2de8 Mon Sep 17 00:00:00 2001 From: ROHANREDDYGADE Date: Wed, 30 Apr 2025 10:15:37 +0530 Subject: [PATCH 6/8] trigger merge check From 86aba6bf641bf60390d68f78c2a526b672ea657b Mon Sep 17 00:00:00 2001 From: ROHANREDDYGADE Date: Wed, 30 Apr 2025 16:20:41 +0530 Subject: [PATCH 7/8] fixed the format issue --- src/engines/yahoo.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/engines/yahoo.rs b/src/engines/yahoo.rs index 789307e3..c10fffe9 100644 --- a/src/engines/yahoo.rs +++ b/src/engines/yahoo.rs @@ -5,8 +5,8 @@ use error_stack::{Report, Result as StackResult, ResultExt}; use std::collections::HashMap; -use reqwest::header::HeaderMap; use reqwest::Client; +use reqwest::header::HeaderMap; use scraper::Html; use crate::models::aggregation::SearchResult; @@ -184,4 +184,4 @@ impl SearchEngine for Yahoo { }) .change_context(EngineError::UnexpectedError) } -} \ No newline at end of file +} From c5af448541b6b710b9c253752285ad135512a522 Mon Sep 17 00:00:00 2001 From: ROHANREDDYGADE Date: Mon, 5 May 2025 10:45:00 +0530 Subject: [PATCH 8/8] resolved clippy and format checks --- src/engines/yahoo.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/engines/yahoo.rs b/src/engines/yahoo.rs index c10fffe9..6c86eef4 100644 --- a/src/engines/yahoo.rs +++ b/src/engines/yahoo.rs @@ -14,8 +14,6 @@ use crate::models::engine::{EngineError, SearchEngine}; use super::search_result_parser::SearchResultParser; -// Removed unused import: std::string::FromUtf8Error - /// A new Yahoo engine type defined in-order to implement the `SearchEngine` trait which allows to /// reduce code duplication as well as allows to create vector of different search engines easily. pub struct Yahoo { @@ -54,7 +52,7 @@ impl Yahoo { // Ok(final_url) // } } - +/// Parses the Yahoo redirect URL and extracts the actual target URL. fn parse_yahoo_redirect_url(raw_url: &str) -> String { // Look for the /RU= marker if let Some(start_idx) = raw_url.find("/RU=") { @@ -80,7 +78,6 @@ fn parse_yahoo_redirect_url(raw_url: &str) -> String { /// Perform a percent-decoding using only the Rust standard library. // use error_stack::{Report, Result}; - /// Perform percent-decoding using only the Rust standard library fn percent_decode(input: &[u8]) -> Result> { let mut output = Vec::with_capacity(input.len());