1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29 package org.openrdf.elmo.scutter.servlet;
30
31 import java.io.File;
32 import java.io.IOException;
33 import java.io.PrintWriter;
34 import java.net.InetAddress;
35 import java.net.URL;
36 import java.util.Date;
37 import java.util.Iterator;
38 import java.util.List;
39 import java.util.regex.Pattern;
40
41 import javax.servlet.ServletConfig;
42 import javax.servlet.ServletException;
43 import javax.servlet.http.HttpServlet;
44 import javax.servlet.http.HttpServletRequest;
45 import javax.servlet.http.HttpServletResponse;
46
47 import org.openrdf.elmo.scutter.RetrieverFactory;
48 import org.openrdf.elmo.scutter.Scutter;
49 import org.openrdf.elmo.scutter.Util;
50 import org.openrdf.repository.Repository;
51 import org.slf4j.Logger;
52 import org.slf4j.LoggerFactory;
53
54
55
56
57
58
59
60
61
62
63
64 public class ScutterServlet extends HttpServlet {
65
66
67
68
69 private static final long serialVersionUID = -3682817881393469043L;
70
71 private final static String CONTENT_TYPE = "text/html";
72
73
74 private final static int DEFAULT_PRELOAD_SIZE = 500;
75
76 private static final int MAX_URLS = 100;
77
78 private Thread _scutterThread = null;
79
80 private Scutter _scutter = null;
81
82 private Repository _repository = null;
83
84 private File _queueFile = null;
85
86
87 private File _blacklistFile = null;
88
89
90 private int _preloadSize = DEFAULT_PRELOAD_SIZE;
91
92 private String _running = "false";
93
94
95 private String[] _startURLs = null;
96
97 public void init(ServletConfig config) throws ServletException {
98 super.init(config);
99
100
101 _repository = Util.initRepository(config);
102 String focused = config.getInitParameter("focused");
103 RetrieverFactory factory = null;
104 try {
105 if (focused != null && focused.length() > 0) {
106 factory = new RetrieverFactory(_repository, true);
107 } else {
108 factory = new RetrieverFactory(_repository, false);
109 }
110 } catch (Exception e) {
111 throw new ServletException("Factory failed to initialize", e);
112 }
113
114 try {
115 _scutter = new Scutter(factory);
116 } catch (Exception e) {
117 throw new ServletException("Scutter failed to initialize", e);
118 }
119
120
121 String queueName = config.getInitParameter("queue");
122 if (queueName != null) {
123 _queueFile = new File(queueName);
124 _scutter.setQueueFile(_queueFile);
125 }
126
127
128 String blacklistName = config.getInitParameter("blacklist");
129 if (blacklistName != null) {
130 _blacklistFile = new File(blacklistName);
131 _scutter.setBlacklistFile(_blacklistFile);
132 }
133
134
135 String preloadSize = config.getInitParameter("preloadSize");
136 if (preloadSize != null) {
137 try {
138 _preloadSize = Integer.parseInt(preloadSize);
139 } catch (Exception e) {
140 throw new ServletException(
141 "Invalid value for <preloadSize> initialization parameter.");
142 }
143 }
144
145
146 try {
147 String domain = config.getInitParameter("domain");
148
149 if (domain != null && !domain.equals("")) {
150 Pattern domainPattern = Pattern.compile(domain);
151 _scutter.setDomainPattern(domainPattern);
152 }
153 } catch (Exception e) {
154 throw new ServletException(
155 "Invalid value for <domain> initialization parameter.");
156 }
157
158
159 try {
160 String vocab = config.getInitParameter("vocab");
161
162 if (vocab != null && vocab.equalsIgnoreCase("foaf")) {
163 factory.setFoafOnly(true);
164 }
165 } catch (Exception e) {
166 throw new ServletException(
167 "Invalid value for <vocab> initialization parameter.");
168 }
169
170
171 String startString = config.getInitParameter("start");
172 try {
173 _startURLs = new String[MAX_URLS];
174 if (startString != null && !startString.equals("")) {
175 String[] terms = startString.split("\\s");
176 int count = 0;
177 for (int i=0; i < Math.min(terms.length, MAX_URLS); i++) {
178 terms[i] = terms[i].trim();
179 if (terms[i].startsWith("http:")) {
180 _startURLs[count++]=terms[i];
181 }
182 }
183 _scutter.initQueue(_startURLs);
184 }
185 } catch (Exception e) {
186 throw new ServletException(e);
187 }
188
189
190 String metadata = config.getInitParameter("metadata");
191
192 if (metadata != null && metadata.equalsIgnoreCase("true")) {
193 _scutter.setStoreMetadata(true);
194 }
195
196
197 String sizelimit = config.getInitParameter("sizelimit");
198 if (sizelimit != null) {
199 try {
200 _scutter.setSizeLimit(Integer.parseInt(sizelimit));
201 } catch (NumberFormatException nfe) {
202 throw new ServletException(
203 "Invalid value for <sizelimit> initialization parameter.");
204 }
205 }
206
207
208 String autoblacklist = config.getInitParameter("autoblacklist");
209 if (autoblacklist != null && autoblacklist.equalsIgnoreCase("false")) {
210 _scutter.setAutoBlackList(false);
211 } else {
212 _scutter.setAutoBlackList(true);
213 }
214
215
216 String maxThreadsString = config.getInitParameter("maxThreads");
217 if (maxThreadsString != null) {
218 try {
219 _scutter.setMaxThreads(Integer.parseInt(maxThreadsString));
220 } catch (NumberFormatException nfe) {
221 throw new ServletException(
222 "Invalid value for <maxThreads> initialization parameter.");
223 }
224 }
225
226 }
227
228 public void doGet(HttpServletRequest request, HttpServletResponse response)
229 throws ServletException, IOException {
230
231 response.setContentType(CONTENT_TYPE);
232
233 PrintWriter out = response.getWriter();
234 out.println("<html>");
235 out.println("<head>");
236 out.println("<title>");
237 out.println("ScutterServlet");
238 out.println("</title>");
239 out.println("<link rel=\"stylesheet\" href=\"elmo.css\" type=\"text/css\">");
240 out.println("</head>");
241 out.println("<body>");
242 out.println("<br/><b>ScutterServlet on </b>"
243 + this.getServletContext().getServerInfo() + "<b> at </b>"
244 + InetAddress.getLocalHost() + "<b> reporting...</b>");
245
246 out.println("<br><b>Using Sesame repository</b> " + _repository);
247 out.println("<br/>");
248 String operation = request.getParameter("operation");
249 if (operation != null) {
250 if (operation.equals("start")) {
251 if (!_running.equals("true")) {
252 synchronized (_running) {
253
254 _scutterThread = new Thread(_scutter);
255 _scutterThread.start();
256 _running = "true";
257 out
258 .println("<br/><b>Scutter started successfully.</b><br/>");
259 }
260 }
261 } else if (operation.equals("stop")) {
262 if (!_running.equals("false")) {
263 synchronized (_running) {
264
265 try {
266 _scutter.stop();
267 }
268 catch (IOException ioe) {
269 out.println("Saving queue file failed, check servlet configuration.");
270 }
271 _running = "false";
272 }
273 }
274 if (!_scutterThread.isAlive()) {
275 out
276 .println("<br/><b>Scutter stopped successfully.</b><br/>");
277 }
278 }
279
280
281
282
283
284
285
286
287
288
289
290
291
292 else if (operation.equals("preloadQueue")) {
293 int loaded = 0;
294 if (_queueFile.exists()) {
295 loaded = _scutter.loadQueue();
296 }
297 out.println("<br/><b>Scutter queue preloaded from file <i>"
298 + _queueFile + "</i> with <b>" + loaded
299 + "</b> URLs.</b>");
300 } else if (operation.equals("clear")) {
301 if (_scutter.clear()) {
302 out
303 .println("<br/><b>Scutter queue and visited list cleared.</b><br/>");
304 } else {
305 out
306 .println("<br/><b>Error while clearing scutter repository, see logs.</b><br/>");
307 }
308
309 if (_startURLs != null) {
310 _scutter.initQueue(_startURLs);
311 }
312 } else if (operation.equals("add")) {
313 if (request.getParameter("url") != null
314 && !request.getParameter("url").equals("")) {
315 try {
316 _scutter.addURL(new URL((String) request
317 .getParameter("url")));
318 } catch (Exception e) {
319 out
320 .println("<br/><b>Error while adding URL. </b><br/>");
321 e.printStackTrace();
322 }
323 } else {
324 out.println("<b/r><b>Error while adding URL.</b><br/>");
325 }
326 }
327
328 }
329 if (_scutterThread != null && _scutterThread.isAlive()) {
330 out.println("<br/>Status:" + "running");
331 } else {
332 out.println("<br/>Status:" + "stopped");
333 }
334
335 List queue = _scutter.getQueue();
336 out.println("<br/>(1) Queue size:" + queue.size());
337 out.println("<br/>(2) Seen so far:" + _scutter.getVisited().size());
338 out.println("<br/>Crawled so far (2-1):" + (_scutter.getVisited().size() - queue.size()));
339
340
341 if (_scutter.getDomainPattern() != null) {
342 out.println("<br/>Whitelist pattern: " +_scutter.getDomainPattern().pattern());
343 }
344 out.println("<br/>");
345 out.println("<br/>Queue contents:<br/>");
346 int counter = 0;
347 synchronized (queue) {
348 Iterator it = queue.iterator();
349 while (it.hasNext()) {
350 URL nextURL = (URL) it.next();
351 out.println("<br/>" + counter++ + ". " + nextURL);
352 }
353 }
354 out.println("<br/><small>" + new Date() + "</small>");
355 out.println("</body>");
356 out.println("</html>");
357
358 out.close();
359 }
360
361
362
363
364
365
366 public static void main(String[] args) throws Exception {
367
368 Repository repository = Util.initRepository(args[0], args[1]);
369
370 String focused = "true";
371 RetrieverFactory factory = null;
372 try {
373 if (focused != null && focused.length() > 0) {
374 factory = new RetrieverFactory(repository, true);
375 } else {
376 factory = new RetrieverFactory(repository, false);
377 }
378 } catch (Exception e) {
379 throw new ServletException("Factory failed to initialize", e);
380 }
381 Logger logger = LoggerFactory.getLogger(ScutterServlet.class);
382
383
384
385
386
387
388
389 try {
390 Scutter scutter = new Scutter(factory);
391 scutter.addURL(new URL(args[2]));
392 scutter.run();
393 System.out.println("<br/><b>Scutter started successfully.</b><br/>");
394 } catch (Exception e) {
395 throw new ServletException("Scutter failed to initialize", e);
396 }
397
398
399
400 }
401 }