Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ url = "2.4"
html5ever = "0.26"
markup5ever_rcdom = "0.2"
lazy_static = "1.4"
log = "0.4.22"

[dependencies.reqwest]
version = "0.11"
Expand Down
15 changes: 15 additions & 0 deletions data/comment.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<!DOCTYPE html>
<html>

<head>
<title>This is title</title>
</head>

<body>
<div class="comment">
<div>My div with more than 25 characters.<p>My paragraph with more than 25 characters.</p>
</div>
</div>
</body>

</html>
2 changes: 1 addition & 1 deletion src/dom.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ pub fn find_node(handle: Handle, tag_name: &str, nodes: &mut Vec<Rc<Node>>) {
}
}

pub fn has_nodes(handle: Handle, tag_names: &Vec<&'static str>) -> bool {
pub fn has_nodes(handle: Handle, tag_names: &[&str]) -> bool {
for child in handle.children.borrow().iter() {
let tag_name: &str = &get_tag_name(child.clone()).unwrap_or_default();
if tag_names.iter().any(|&n| n == tag_name) {
Expand Down
22 changes: 18 additions & 4 deletions src/extractor.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
use crate::scorer::{Scorer, DEFAULT_SCORER};
use dom;
use error::Error;
use html5ever::tendril::stream::TendrilSink;
use html5ever::{parse_document, serialize};
use log::debug;
use markup5ever_rcdom::{RcDom, SerializableHandle};
#[cfg(feature = "reqwest")]
use reqwest;
Expand Down Expand Up @@ -37,7 +39,8 @@ pub fn scrape(url: &str) -> Result<Product, Error> {
}
}

pub fn extract<R>(input: &mut R, url: &Url) -> Result<Product, Error>
/// Extract text with a custom [`Scorer`].
pub fn extract_with_scorer<R>(input: &mut R, url: &Url, scorer: &Scorer) -> Result<Product, Error>
where
R: Read,
{
Expand All @@ -48,8 +51,11 @@ where
let mut candidates = BTreeMap::new();
let mut nodes = BTreeMap::new();
let handle = dom.document.clone();
scorer::preprocess(&mut dom, handle.clone(), &mut title);
scorer::find_candidates(Path::new("/"), handle.clone(), &mut candidates, &mut nodes);
scorer.preprocess(&mut dom, handle.clone(), &mut title);
scorer.find_candidates(Path::new("/"), handle.clone(), &mut candidates, &mut nodes);

debug!("Found candidates: {}", candidates.values().len());

let mut id: &str = "/";
let mut top_candidate: &Candidate = &Candidate {
node: handle.clone(),
Expand All @@ -67,7 +73,7 @@ where
let mut bytes = vec![];

let node = top_candidate.node.clone();
scorer::clean(&mut dom, Path::new(id), node.clone(), url, &candidates);
scorer.clean(&mut dom, Path::new(id), node.clone(), url, &candidates);

serialize(
&mut bytes,
Expand All @@ -85,3 +91,11 @@ where
text,
})
}

/// Extract text with the default [`Scorer`].
pub fn extract<R>(input: &mut R, url: &Url) -> Result<Product, Error>
where
R: Read,
{
extract_with_scorer(input, url, &DEFAULT_SCORER)
}
7 changes: 4 additions & 3 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
#[macro_use]
extern crate html5ever;
extern crate markup5ever_rcdom;
extern crate regex;
extern crate url;
#[macro_use]
extern crate lazy_static;
extern crate log;
extern crate markup5ever_rcdom;
extern crate regex;
#[cfg(feature = "reqwest")]
extern crate reqwest;
extern crate url;

pub mod dom;
pub mod error;
Expand Down
Loading