-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathGnutchGrailsPlugin.groovy
103 lines (80 loc) · 3.69 KB
/
GnutchGrailsPlugin.groovy
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import org.apache.activemq.ActiveMQConnectionFactory
import org.apache.activemq.pool.PooledConnectionFactory
import org.apache.camel.component.http4.HttpComponent
import org.apache.camel.Exchange
/**
 * Grails plugin descriptor for gnutch.
 *
 * Declares the plugin metadata and, in {@code doWithSpring}, fills in defaults
 * for the {@code gnutch} configuration section and registers the Spring beans
 * (JMS connection factories, the Camel http4 component and the gnutch
 * processors/services) defined below.
 */
class GnutchGrailsPlugin {
    def version = "0.2.3.2"
    def grailsVersion = "2.2 > *"
    def loadAfter = ['controllers', 'services', 'routing']
    def title = "Grails Apache Nutch alternative"
    def documentation = "http://grails.org/plugin/gnutch"
    def license = "APACHE"
    def developers = [
    ]
    def issueManagement = [system: "GitHub", url: "https://github.com/softsky/gnutch/issues"]
    def scm = [url: "https://github.com/softsky/gnutch"]
    def description = '''\
Very simple alternative to "Apache Nutch":http://nutch.apache.org/ created in Grails.
Crawled data could be stored to files, saved to database or sent to Apache Solr server for indexing.
Use "Apache Camel":http://camel.apache.org/ as integration framework and "Apache ActiveMQ":http://activemq.apache.org/ as source messaging and integration patterns server.
'''

    /**
     * Applies configuration defaults and registers the plugin's Spring beans.
     *
     * Scalar settings are defaulted with the Elvis operator. Map-valued
     * sections (crawl, handlers, http, activemq) are defaulted key by key, so
     * a section that is only partially configured still receives values for
     * the keys it omits.
     */
    def doWithSpring = {
        // Defaulting config; when no gnutch section is configured the defaults
        // are collected in a fresh map.
        def conf = application.config.gnutch ?: [:]
        println "Config:" + conf

        // A nested value counts as missing when it is null or an empty map.
        def isMissing = { value -> value == null || (value instanceof Map && value.isEmpty()) }

        // Ensures conf[section] exists and carries every key of `defaults`.
        // Values already present are never overwritten; a section that is set
        // to something other than a map is left exactly as configured.
        def applyDefaults = { String section, Map defaults ->
            def current = conf[section] ?: [:]
            if (!(current instanceof Map)) {
                return
            }
            defaults.each { key, value ->
                if (isMissing(current[key])) {
                    current[key] = value
                }
            }
            conf[section] = current
        }

        // NOTE(review): this default points into a developer's home directory;
        // deployments most likely need to set gnutch.inputRoute explicitly.
        conf.inputRoute = conf.inputRoute ?: 'file:///home/archer/tmp/gnutch-input'
        conf.aggregationTime = conf.aggregationTime ?: 30000L

        applyDefaults('crawl', [threads: 1, multiplier: 1])

        // Default handlers are no-op closures.
        applyDefaults('handlers', [
            postXHTML: { Exchange ex -> },
            postXML: { Exchange ex -> },
            validate: { Exchange ex -> },
            publish: {}
        ])

        applyDefaults('http', [
            // UserAgent string. Better if it contains the email address of the person
            // responsible for crawling, so source owners can contact that person directly
            userAgent: 'GNutch crawler (https://github.com/softsky/gnutch): [email protected]',
            // Maximum number of connections per host
            defaultMaxConnectionsPerHost: 1000,
            // Maximum number of total connections
            maxTotalConnections: 1000,
        ])

        applyDefaults('activemq', [
            // URL to message broker
            brokerURL: 'vm://localhost'
            // brokerURL: 'tcp://localhost:61616'
            // conf: 'classpath:activemq.xml'
        ])

        // Optional external ActiveMQ bean definitions.
        if (conf.activemq.conf) {
            println "Importing activemq configuration from ${conf.activemq.conf}"
            importBeans conf.activemq.conf
        }

        jmsFactory(ActiveMQConnectionFactory) {
            brokerURL = conf.activemq.brokerURL
        }

        // Pooled wrapper around jmsFactory.
        jmsConnectionFactory(PooledConnectionFactory) {
            connectionFactory = ref('jmsFactory')
            maxConnections = 8
            //maximumActive = 500;
        }

        // Camel http4 component sized from the gnutch.http settings.
        http4(HttpComponent) {
            camelContext = ref('camelContext')
            connectionsPerRoute = conf.http.defaultMaxConnectionsPerHost
            maxTotalConnections = conf.http.maxTotalConnections
        }

        docsAggregator(gnutch.processors.DocsAggregator)

        patternService(gnutch.urls.PatternService) { bean ->
            bean.scope = 'singleton' // explicitly setting scope to `singleton`
        }

        // Prototype-scoped bean obtained through the class's getInstance factory method.
        regexUrlChecker(gnutch.urls.RegexUrlChecker) { bean ->
            bean.scope = 'prototype'
            bean.factoryMethod = 'getInstance'
        }

        contextUrlResolver(gnutch.urls.ContextUrlResolver)
        documentIndexer(gnutch.indexer.DocumentIndexer)
        schedulerService(gnutch.quartz.SchedulerService)
        tikaContentExtractor(gnutch.TikaContentExtractor)
    }
}