Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 19 additions & 4 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
[package]
name = "readability"
edition = "2021"
version = "0.2.0"
authors = ["Hiroki Kumamoto <[email protected]>"]
license = "MIT"
Expand All @@ -12,14 +13,28 @@ categories = []
[dependencies]
regex = "1.4"
url = "2.2"
html5ever = "0.25"
markup5ever_rcdom = "0.1"
html5ever = "0.26.0"
markup5ever_rcdom = "0.2.0"
lazy_static = "1.4"

[dependencies.reqwest]
version = "0.11"
optional = true
features = ["blocking"]

[features]
default = ["reqwest"]
default = ["http-async"]
http-async = ["reqwest"]
http-blocking = ["reqwest", "reqwest/blocking"]

[dev-dependencies]
tokio = { version = "1.25.0", features = ["full"] }

[[test]]
required-features = ["http-blocking"]
name = "blocking"
path = "tests/blocking.rs"

[[test]]
required-features = ["http-async"]
name = "async"
path = "tests/async.rs"
125 changes: 58 additions & 67 deletions src/dom.rs
Original file line number Diff line number Diff line change
@@ -1,51 +1,52 @@
use std::rc::Rc;
use html5ever::tendril::StrTendril;
use html5ever::Attribute;
use markup5ever_rcdom::NodeData::{Element, Text};
use markup5ever_rcdom::{Handle, Node};
use html5ever::Attribute;
use std::rc::Rc;
use std::str::FromStr;

/// Returns the lowercased local tag name of `handle` when it is an element
/// node, or `None` for any other kind of node.
// NOTE(review): this span shows both the removed and the added form of the
// `Element` arm from the diff hunk; they differ only by a trailing `.to_string()`.
pub fn get_tag_name(handle: Handle) -> Option<String> {
match handle.data {
Element { ref name, .. } => Some(name.local.as_ref().to_lowercase().to_string()),
Element { ref name, .. } => Some(name.local.as_ref().to_lowercase()),
_ => None,
}
}

/// Looks up the attribute called `name` on `handle`.
///
/// Delegates to [`attr`] with the element's borrowed attribute list; returns
/// `None` when `handle` is not an element node.
// NOTE(review): this span shows both the removed and the added signature and
// match arms from the diff hunk (the `'a` lifetime is dropped in the new form).
pub fn get_attr<'a>(name: &str, handle: Handle) -> Option<String> {
pub fn get_attr(name: &str, handle: Handle) -> Option<String> {
match handle.data {
Element { name: _, ref attrs, .. } => attr(name, &attrs.borrow()),
_ => None,
Element {
name: _, ref attrs, ..
} => attr(name, &attrs.borrow()),
_ => None,
}
}

/// Returns the value of the first attribute in `attrs` whose local name equals
/// `attr_name`, as an owned `String`, or `None` when no attribute matches.
// NOTE(review): this span shows both the removed (`&Vec<Attribute>`) and the
// added (`&[Attribute]`) signature from the diff hunk.
pub fn attr(attr_name: &str, attrs: &Vec<Attribute>) -> Option<String> {
pub fn attr(attr_name: &str, attrs: &[Attribute]) -> Option<String> {
for attr in attrs.iter() {
if attr.name.local.as_ref() == attr_name {
return Some(attr.value.to_string())
return Some(attr.value.to_string());
}
}
None
}

/// Overwrites the value of an existing attribute `attr_name` on `handle`.
///
/// Does nothing when `handle` is not an element node, when no attribute with
/// that local name exists (no attribute is added), or when `value` cannot be
/// converted to a `StrTendril`.
// NOTE(review): this span interleaves the removed `match`-based body and the
// added `if let`-based body from the diff hunk.
pub fn set_attr(attr_name: &str, value: &str, handle: Handle) {
match handle.data {
Element { name: _, ref attrs, .. } => {
let attrs = &mut attrs.borrow_mut();
if let Some(index) = attrs.iter().position(|attr| {
let name = attr.name.local.as_ref();
name == attr_name
}) {
match StrTendril::from_str(value) {
Ok(value) => attrs[index] = Attribute {
name: attrs[index].name.clone(),
value: value,
},
Err(_) => (),
if let Element {
name: _, ref attrs, ..
} = handle.data
{
let attrs = &mut attrs.borrow_mut();
if let Some(index) = attrs.iter().position(|attr| {
let name = attr.name.local.as_ref();
name == attr_name
}) {
// A failed tendril conversion is ignored; the attribute keeps its old value.
if let Ok(value) = StrTendril::from_str(value) {
attrs[index] = Attribute {
name: attrs[index].name.clone(),
value,
}
}
}
_ => (),
}
}

Expand All @@ -64,54 +65,52 @@ pub fn is_empty(handle: Handle) -> bool {
match c.data {
Text { ref contents } => {
if contents.borrow().trim().len() > 0 {
return false
return false;
}
},
}
Element { ref name, .. } => {
let tag_name = name.local.as_ref();
match tag_name.to_lowercase().as_ref() {
"li" | "dt" | "dd" | "p" | "div" => {
if !is_empty(child.clone()) {
return false
return false;
}
},
}
_ => return false,
}
},
_ => ()
}
_ => (),
}
}
match get_tag_name(handle.clone()).unwrap_or_default().as_ref() {
"li" | "dt" | "dd" | "p" | "div" | "canvas" => true,
_ => false,
}
matches!(
get_tag_name(handle).unwrap_or_default().as_ref(),
"li" | "dt" | "dd" | "p" | "div" | "canvas"
)
}

/// Returns `true` when `handle` itself is an `a` element or any of its
/// descendants is one; the children are searched recursively.
// NOTE(review): this span shows both the removed and the added `return`
// statements from the diff hunk.
pub fn has_link(handle: Handle) -> bool {
if "a" == &get_tag_name(handle.clone()).unwrap_or_default() {
return true
return true;
}
for child in handle.children.borrow().iter() {
if has_link(child.clone()) {
return true
return true;
}
}
return false
false
}

/// Appends the trimmed contents of every text child of `handle` to `text`.
///
/// When `deep` is `true`, element children are visited recursively as well;
/// otherwise only the direct text children contribute. Pieces are concatenated
/// without any separator.
// NOTE(review): this span interleaves the removed and the added `match` arms
// from the diff hunk.
pub fn extract_text(handle: Handle, text: &mut String, deep: bool) {
for child in handle.children.borrow().iter() {
let c = child.clone();
match c.data {
Text { ref contents } => {
text.push_str(contents.borrow().trim());
},
match &c.data {
Text { contents } => text.push_str(contents.borrow().trim()),
Element { .. } => {
if deep {
extract_text(child.clone(), text, deep);
}
},
_ => ()
}
_ => (),
}
}
}
Expand All @@ -123,11 +122,11 @@ pub fn text_len(handle: Handle) -> usize {
match c.data {
Text { ref contents } => {
len += contents.borrow().trim().chars().count();
},
}
Element { .. } => {
len += text_len(child.clone());
},
_ => ()
}
_ => (),
}
}
len
Expand All @@ -136,15 +135,12 @@ pub fn text_len(handle: Handle) -> usize {
/// Collects into `nodes` every descendant element of `handle` whose lowercased
/// local name equals `tag_name`.
///
/// The search recurses into every element child, whether or not it matched.
/// Because the node's name is lowercased before comparison, `tag_name` must be
/// lowercase to match anything.
// NOTE(review): this span interleaves the removed `match` body and the added
// `if let` body from the diff hunk.
pub fn find_node(handle: Handle, tag_name: &str, nodes: &mut Vec<Rc<Node>>) {
for child in handle.children.borrow().iter() {
let c = child.clone();
match c.data {
Element { ref name, .. } => {
let t = name.local.as_ref();
if t.to_lowercase() == tag_name {
nodes.push(child.clone());
};
find_node(child.clone(), tag_name, nodes)
},
_ => ()
if let Element { name, .. } = &c.data {
let t = name.local.as_ref();
if t.to_lowercase() == tag_name {
nodes.push(child.clone());
};
find_node(child.clone(), tag_name, nodes)
}
}
}
Expand All @@ -153,32 +149,27 @@ pub fn has_nodes(handle: Handle, tag_names: &Vec<&'static str>) -> bool {
for child in handle.children.borrow().iter() {
let tag_name: &str = &get_tag_name(child.clone()).unwrap_or_default();
if tag_names.iter().any(|&n| n == tag_name) {
return true
return true;
}
if match child.clone().data {
Element { .. } => {
has_nodes(child.clone(), tag_names)
},
Element { .. } => has_nodes(child.clone(), tag_names),
_ => false,
} {
return true
return true;
}
}
return false
false
}

pub fn text_children_count(handle: Handle) -> usize {
let mut count = 0;
for child in handle.children.borrow().iter() {
let c = child.clone();
match c.data {
Text { ref contents } => {
let s = contents.borrow();
if s.trim().len() >= 20 {
count += 1
}
},
_ => ()
if let Text { ref contents } = c.data {
let s = contents.borrow();
if s.trim().len() >= 20 {
count += 1
}
}
}
count
Expand Down
24 changes: 13 additions & 11 deletions src/error.rs
Original file line number Diff line number Diff line change
@@ -1,24 +1,24 @@
use std::fmt::{Display, Formatter, Result as FmtResult};
use std::error;
#[cfg(feature = "reqwest")]
#[cfg(any(feature = "http-async", feature = "http-blocking"))]
use reqwest;
use std::error;
use std::fmt::{Display, Formatter, Result as FmtResult};
use url;

/// Errors produced by this crate.
// NOTE(review): this span interleaves removed and added lines from the diff
// hunk (two cfg attributes on `NetworkError`; `Unexpected` alongside `HttpError`).
#[derive(Debug)]
pub enum Error {
#[cfg(feature = "reqwest")]
#[cfg(any(feature = "http-async", feature = "http-blocking"))]
// Wraps a `reqwest::Error`; only present when an HTTP feature is enabled.
NetworkError(reqwest::Error),
// Wraps a `url::ParseError`.
UrlParseError(url::ParseError),
Unexpected,
// NOTE(review): this variant names `reqwest::StatusCode` but carries no cfg
// gate, while `reqwest` is declared as an optional dependency — confirm the
// crate still builds with both `http-async` and `http-blocking` disabled.
HttpError(reqwest::StatusCode),
}

/// Renders each variant as a short label followed by the wrapped value.
// NOTE(review): this span interleaves the removed arms (positional `{}`
// formatting, `Unexpected`) and the added arms (inline `{e}` formatting,
// `HttpError`) from the diff hunk.
impl Display for Error {
fn fmt(&self, f: &mut Formatter) -> FmtResult {
match *self {
#[cfg(feature = "reqwest")]
Error::NetworkError(ref e) => write!(f, "NetworkError: {}", e),
Error::UrlParseError(ref e) => write!(f, "UrlParseError: {}", e),
Error::Unexpected => write!(f, "UnexpectedError"),
#[cfg(any(feature = "http-async", feature = "http-blocking"))]
Error::NetworkError(ref e) => write!(f, "NetworkError: {e}"),
Error::UrlParseError(ref e) => write!(f, "UrlParseError: {e}"),
Error::HttpError(status_code) => write!(f, "Http error, status: {status_code}"),
}
}
}
Expand All @@ -29,13 +29,15 @@ impl From<url::ParseError> for Error {
}
}

/// Wraps a `reqwest::Error` in [`Error::NetworkError`], which lets `?` convert
/// reqwest failures into this crate's error type.
// NOTE(review): the two cfg attributes below are the removed and the added
// gate from the diff hunk.
#[cfg(feature = "reqwest")]
#[cfg(any(feature = "http-async", feature = "http-blocking"))]
impl From<reqwest::Error> for Error {
fn from(err: reqwest::Error) -> Error {
Error::NetworkError(err)
}
}

/// Marks [`Error`] as a standard error type; `description` always returns an
/// empty string.
// NOTE(review): this span shows both the removed one-line and the added
// multi-line form of `description` from the diff hunk.
impl error::Error for Error {
fn description(&self) -> &str { "" }
fn description(&self) -> &str {
""
}
}
32 changes: 32 additions & 0 deletions src/extractor/blocking_client.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
use crate::dom;
use crate::error::Error;
use crate::extractor::{extract, ReadableHtmlPage};
use crate::scorer;
use crate::scorer::Candidate;
use html5ever::tendril::stream::TendrilSink;
use html5ever::{parse_document, serialize};
use markup5ever_rcdom::{RcDom, SerializableHandle};
use reqwest;
use std::cell::Cell;
use std::collections::BTreeMap;
use std::default::Default;
use std::io::Read;
use std::path::Path;
use std::time::Duration;
use url::Url;

/// Scrape the given url and return a [`ReadableHtmlPage`]
pub fn scrape(url: &str) -> Result<ReadableHtmlPage, Error> {
let client = reqwest::blocking::Client::builder()
.timeout(Duration::new(30, 0))
.user_agent(super::APP_USER_AGENT)
.build()?;

let mut res = client.get(url).send()?;
if res.status().is_success() {
let url = Url::parse(url)?;
extract(&mut res, &url)
} else {
Err(Error::HttpError)
}
}
22 changes: 22 additions & 0 deletions src/extractor/client.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
use crate::error::Error;
use crate::extractor::{extract, ReadableHtmlPage};
use std::time::Duration;
use url::Url;

/// Scrape the given url and return a [`ReadableHtmlPage`]
pub async fn scrape(url: &str) -> Result<ReadableHtmlPage, Error> {
let client = reqwest::Client::builder()
.timeout(Duration::new(30, 0))
.user_agent(super::APP_USER_AGENT)
.build()?;

let res = client.get(url).send().await?;

if res.status().is_success() {
let url = Url::parse(url)?;
let read = res.text().await?;
extract(&mut read.as_bytes(), &url)
} else {
Err(Error::HttpError(res.status()))
}
}
Loading