Open
Description
Hi,
I am trying to get a summary text for a given webpage, but the returned summary is short / bad and I can't figure out why.
Here are my steps:
- Use needle to get the url html data
- Run the data through readability so only the core of the html is extracted.
- Sanitize the html to only return the text.
- Run summarize...
Why am I getting only one or two lines of summary? Can I control how long the summary output should be? I want the output to be something like http://smmry.com
Here is the code:
var needle = require('needle');
var read = require('node-readability');
var sanitizeHtml = require('sanitize-html');
var SummaryTool = require('node-summary');
// Page to summarize. The literal is trimmed so stray whitespace around a
// pasted URL never reaches the HTTP client.
var url = "http://www.inc.com/gene-marks/the-one-way-to-tell-if-you-re-a-successful-entrepreneur.html?cid=sf01001".trim();

// Pipeline: fetch the page -> extract the main article with readability ->
// strip all markup -> summarize the plain text and print size statistics.
needle.get(url, function(error, response) {
  // Report request failures instead of dropping them silently.
  if (error) {
    console.error("Request failed: " + error.message);
    return;
  }
  if (response.statusCode !== 200) {
    console.error("Unexpected HTTP status: " + response.statusCode);
    return;
  }

  read(response.body, function(err, article, meta) {
    // Extraction can fail or yield no content; guard before dereferencing.
    if (err || !article || !article.content) {
      console.error("Could not extract article content" + (err ? ": " + err.message : ""));
      return;
    }

    // Copy what we need before releasing the article.
    var title = article.title;
    // Empty allow-lists make sanitize-html drop every tag and attribute,
    // leaving only the text.
    var cleanCont = sanitizeHtml(article.content, {
      allowedTags: [],
      allowedAttributes: {}
    });

    // NOTE(review): node-readability's README recommends closing the article
    // to free its DOM; guarded in case the installed version lacks close().
    if (typeof article.close === "function") {
      article.close();
    }

    SummaryTool.summarize(title, cleanCont, function(err, summary) {
      if (err) {
        console.log("Something went wrong man!");
        // Stop here: summary may be undefined, so the statistics below
        // would throw on summary.length.
        return;
      }

      var originalLength = title.length + cleanCont.length;

      console.log(summary);
      console.log("Original Length " + originalLength);
      console.log("Summary Length " + summary.length);
      // Percentage of the original text removed by summarization.
      console.log("Summary Ratio: " + (100 - (100 * (summary.length / originalLength))));
    });
  });
});
Thanks
Chris
Metadata
Metadata
Assignees
Labels
No labels