diff --git a/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java b/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
index 21f487d5f..82455ec56 100644
--- a/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
+++ b/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
@@ -106,8 +106,11 @@ public class WebCrawler implements Runnable {
      *            the id of this crawler instance
      * @param crawlController
      *            the controller that manages this crawling session
+     * @throws IllegalAccessException if a component built during initialization is not accessible
+     * @throws InstantiationException if a component built during initialization cannot be instantiated
      */
-    public void init(int id, CrawlController crawlController) {
+    public void init(int id, CrawlController crawlController)
+        throws InstantiationException, IllegalAccessException {
         this.myId = id;
         this.pageFetcher = crawlController.getPageFetcher();
         this.robotstxtServer = crawlController.getRobotstxtServer();
diff --git a/src/main/java/edu/uci/ics/crawler4j/parser/AllTagMapper.java b/src/main/java/edu/uci/ics/crawler4j/parser/AllTagMapper.java
new file mode 100644
index 000000000..129933e99
--- /dev/null
+++ b/src/main/java/edu/uci/ics/crawler4j/parser/AllTagMapper.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.crawler4j.parser;
+
+import java.util.Locale;
+
+import org.apache.tika.parser.html.HtmlMapper;
+
+/**
+ * {@link HtmlMapper} that accepts every HTML element and attribute: nothing is
+ * reported as a discard element, and every name is mapped to its lower-case form.
+ * Lower-casing uses {@link Locale#ROOT} so the mapped names do not depend on the
+ * JVM's default locale.
+ *
+ * @author Andrey Nikolaev (vajadhava@gmail.com)
+ */
+public class AllTagMapper implements HtmlMapper {
+
+    /** Accepts every element, returning its name in lower case. */
+    @Override
+    public String mapSafeElement(String name) {
+        return name.toLowerCase(Locale.ROOT);
+    }
+
+    /** Never marks an element as discarded, whatever its name. */
+    @Override
+    public boolean isDiscardElement(String name) {
+        return false;
+    }
+
+    /** Accepts every attribute of every element, returning its name in lower case. */
+    @Override
+    public String mapSafeAttribute(String elementName, String attributeName) {
+        return attributeName.toLowerCase(Locale.ROOT);
+    }
+}
diff --git a/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java b/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java
index 54cb5d74d..a364e177d 100644
--- a/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java
+++ b/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java
@@ -40,7 +40,8 @@ private enum Element {
         IMG,
         BASE,
         META,
-        BODY
+        BODY,
+        SCRIPT
     }
 
     private static class HtmlFactory {
@@ -97,7 +98,7 @@ public void startElement(String uri, String localName, String qName, Attributes
             }
         } else if ((element == Element.IFRAME) ||
                    (element == Element.FRAME) ||
-                   (element == Element.EMBED)) {
+                   (element == Element.EMBED) || (element == Element.SCRIPT)) {
             String src = attributes.getValue("src");
             if (src != null) {
                 addToOutgoingUrls(src, localName);
diff --git a/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java b/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java
index fbc065bde..fd7421506 100644
--- a/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java
+++ b/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java
@@ -27,7 +27,9 @@
 import org.apache.tika.metadata.DublinCore;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.html.HtmlMapper;
 import org.apache.tika.parser.html.HtmlParser;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -50,10 +52,13 @@ public class Parser extends Configurable {
     private final HtmlParser htmlParser;
     private final ParseContext parseContext;
 
-    public Parser(CrawlConfig config) {
+    public Parser(CrawlConfig config) throws InstantiationException, IllegalAccessException {
         super(config);
         htmlParser = new HtmlParser();
         parseContext = new ParseContext();
+        // Register the pass-through mapper so elements such as <script> are not filtered out.
+        // No reflection is needed here; the declared exceptions are retained only for signature compatibility.
+        parseContext.set(HtmlMapper.class, new AllTagMapper());
     }
 
     public void parse(Page page, String contextURL)
@@ -159,4 +164,4 @@ public void parse(Page page, String contextURL)
             }
         }
     }
-}
\ No newline at end of file
+}
diff --git a/src/test/java/edu/uci/ics/crawler4j/examples/localdata/Downloader.java b/src/test/java/edu/uci/ics/crawler4j/examples/localdata/Downloader.java
index cabe851a2..b43b16677 100644
--- a/src/test/java/edu/uci/ics/crawler4j/examples/localdata/Downloader.java
+++ b/src/test/java/edu/uci/ics/crawler4j/examples/localdata/Downloader.java
@@ -40,13 +40,13 @@ public class Downloader {
     private final Parser parser;
     private final PageFetcher pageFetcher;
 
-    public Downloader() {
+    public Downloader() throws InstantiationException, IllegalAccessException {
         CrawlConfig config = new CrawlConfig();
         parser = new Parser(config);
         pageFetcher = new PageFetcher(config);
     }
 
-    public static void main(String[] args) {
+    public static void main(String[] args) throws InstantiationException, IllegalAccessException {
         Downloader downloader = new Downloader();
         downloader.processUrl("http://en.wikipedia.org/wiki/Main_Page/");
         downloader.processUrl("http://www.yahoo.com/");
diff --git a/src/test/java/edu/uci/ics/crawler4j/tests/HtmlContentHandlerTest.java b/src/test/java/edu/uci/ics/crawler4j/tests/HtmlContentHandlerTest.java
index 3c2a41827..8ce2ccd93 100644
--- a/src/test/java/edu/uci/ics/crawler4j/tests/HtmlContentHandlerTest.java
+++ b/src/test/java/edu/uci/ics/crawler4j/tests/HtmlContentHandlerTest.java
@@ -6,9 +6,12 @@
 
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.html.HtmlMapper;
 import org.apache.tika.parser.html.HtmlParser;
 import org.junit.Test;
 
+import edu.uci.ics.crawler4j.parser.AllTagMapper;
+import edu.uci.ics.crawler4j.parser.ExtractedUrlAnchorPair;
 import edu.uci.ics.crawler4j.parser.HtmlContentHandler;
 
 public class HtmlContentHandlerTest {
@@ -20,6 +23,7 @@ private HtmlContentHandler parseHtml(String html) throws Exception {
         ByteArrayInputStream bais = new ByteArrayInputStream(html.getBytes());
         Metadata metadata = new Metadata();
         HtmlContentHandler contentHandler = new HtmlContentHandler();
+        parseContext.set(HtmlMapper.class, new AllTagMapper());
         parser.parse(bais, contentHandler, metadata, parseContext);
         return contentHandler;
     }
@@ -50,4 +54,16 @@ public void testTableInBody() throws Exception {
         assertEquals("Hello there mr bear", parse.getBodyText());
     }
 
+    @Test
+    public void testScriptInHead() throws Exception {
+
+        // NOTE(review): markup reconstructed from the assertion below; confirm against the original test.
+        HtmlContentHandler parse = parseHtml("<html><head><script src=\"/js/app.js\"></script></head>" +
+                                             "<body></body>" +
+                                             "</html>");
+
+        ExtractedUrlAnchorPair script = parse.getOutgoingUrls().get(0);
+        assertEquals("/js/app.js", script.getHref());
+    }
+
 }