mockSearchEngine/XmlParser.java at main · SebastienBolh/mockSearchEngine · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
package finalproject;

import java.io.*;
import java.util.*;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/* MIT LICENSE DOES NOT APPLY TO THIS FILE.
 * READ LICENSE.
 * This was given for the final. I did not write any of it.
 */

/**
 * Simple DOM-based reader for an XML file that describes web pages.
 *
 * <p>The whole file is parsed once in the constructor. The query methods then look up
 * {@code <webpage name="...">} elements and read their {@code <link name="...">},
 * {@code <content value="...">} and {@code <rank value="...">} descendants.
 */
public class XmlParser {
    /** Single characters that separate tokens in a content value: whitespace , . ! ( ) - */
    private static final Pattern TOKEN_DELIMITER = Pattern.compile("\\s|,|\\.|!|\\(|\\)|-");

    File inputFile;
    DocumentBuilderFactory dbFactory;
    DocumentBuilder dBuilder;
    Document doc;

    /**
     * Reads and parses the xml file.
     *
     * <p>NOTE(review): the factory is used with its default settings, so any DTD or external
     * entity handling is whatever the installed parser does by default. Harden the factory
     * if {@code fileName} can ever point at untrusted input.
     *
     * @param fileName path of the xml file to load; must not be null
     * @throws SAXException if the file is not well-formed xml
     * @throws IOException if the file cannot be read
     * @throws ParserConfigurationException if no DOM parser can be created
     */
    public XmlParser(String fileName) throws SAXException, IOException, ParserConfigurationException
    {
        inputFile = new File(Objects.requireNonNull(fileName, "fileName"));
        dbFactory = DocumentBuilderFactory.newInstance();
        dBuilder = dbFactory.newDocumentBuilder();
        doc = dBuilder.parse(inputFile);
        doc.getDocumentElement().normalize();
    }

    /**
     * Searches the xml file for the given webpage/url and returns the links contained in it.
     *
     * <p>Unlike {@link #getContent(String)} and {@link #getPageRank(String)}, this method does
     * not stop at the first match: links of every {@code <webpage>} whose name equals
     * {@code url} are collected, in document order.
     *
     * @param url name of the webpage to look up; null matches nothing
     * @return the {@code name} attribute of each {@code <link>}; empty if the page is unknown
     */
    public ArrayList<String> getLinks(String url)
    {
        ArrayList<String> urls = new ArrayList<String>();
        NodeList webPages = doc.getElementsByTagName("webpage");

        // loop over available webpages in the xml file
        for (int i = 0; i < webPages.getLength(); i++)
        {
            Element webpage = (Element) webPages.item(i);
            if (!isNamed(webpage, url))
            {
                continue;
            }

            NodeList links = webpage.getElementsByTagName("link");
            for (int j = 0; j < links.getLength(); j++)
            {
                urls.add(((Element) links.item(j)).getAttribute("name"));
            }
        }

        return urls;
    }

    /**
     * Searches the xml file for the given webpage/url and returns its text content as tokens.
     *
     * <p>Only the first matching {@code <webpage>} is used. Each {@code <content>} value is
     * split on whitespace and the characters {@code , . ! ( ) -}. Empty strings produced by
     * adjacent delimiters (for example ", ") are skipped, so every returned token is non-empty.
     *
     * @param url name of the webpage to look up; null matches nothing
     * @return the non-empty tokens in document order; empty if the page is unknown
     */
    public ArrayList<String> getContent(String url)
    {
        ArrayList<String> tokens = new ArrayList<String>();
        Element webpage = findWebpage(url);
        if (webpage == null)
        {
            return tokens;
        }

        NodeList contents = webpage.getElementsByTagName("content");
        for (int j = 0; j < contents.getLength(); j++)
        {
            String value = ((Element) contents.item(j)).getAttribute("value");
            for (String token : TOKEN_DELIMITER.split(value))
            {
                // split() keeps leading/interior empty strings; they are not words
                if (!token.isEmpty())
                {
                    tokens.add(token);
                }
            }
        }

        return tokens;
    }

    /**
     * Searches the xml file for the given webpage/url and returns its expected rank.
     *
     * <p>Only the first matching {@code <webpage>} and its first {@code <rank>} are used.
     *
     * @param url name of the webpage to look up; null matches nothing
     * @return the parsed {@code value} of the rank element, or 0.0 if the page is unknown,
     *         has no {@code <rank>}, or the rank has no (or a blank) {@code value}
     * @throws NumberFormatException if the rank value is non-blank but not a valid double
     */
    public double getPageRank(String url)
    {
        Element webpage = findWebpage(url);
        if (webpage == null)
        {
            return 0.0;
        }

        NodeList rankNodes = webpage.getElementsByTagName("rank");
        if (rankNodes.getLength() == 0)
        {
            return 0.0;
        }

        // getAttribute yields "" for a missing attribute, which parseDouble would reject
        String value = ((Element) rankNodes.item(0)).getAttribute("value").trim();
        return value.isEmpty() ? 0.0 : Double.parseDouble(value);
    }

    // Returns the first <webpage> element whose name equals url, or null if there is none.
    private Element findWebpage(String url)
    {
        NodeList webPages = doc.getElementsByTagName("webpage");

        for (int i = 0; i < webPages.getLength(); i++)
        {
            Element webpage = (Element) webPages.item(i);
            if (isNamed(webpage, url))
            {
                return webpage;
            }
        }

        return null;
    }

    // Tells whether the element's "name" attribute equals url (false for a null url).
    private static boolean isNamed(Element webpage, String url)
    {
        // getAttribute never returns null, so calling equals on it is safe for any url
        return webpage.getAttribute("name").equals(url);
    }

    /** Small demo: loads "test.xml" and prints links, content and rank of www.ea.com. */
    public static void main(String []args) throws Exception
    {
        XmlParser xmlParser = new XmlParser("test.xml");

        ArrayList<String> urls = xmlParser.getLinks("www.ea.com");
        ArrayList<String> content = xmlParser.getContent("www.ea.com");
        double rank = xmlParser.getPageRank("www.ea.com");

        System.out.println("\nShow linked urls");
        for (String url : urls)
        {
            System.out.println(url);
        }

        System.out.println("\nWebpage content");
        for (String c : content)
        {
            System.out.println(c);
        }

        System.out.println("\nWebpage rank " + rank);
    }
} // end of XmlParser