View Javadoc

1   /*
2    * Copyright (c) 2007, Peter Mika All rights reserved.
3    * 
4    * Redistribution and use in source and binary forms, with or without
5    * modification, are permitted provided that the following conditions are met:
6    * 
7    * - Redistributions of source code must retain the above copyright notice, this
8    *   list of conditions and the following disclaimer.
9    * - Redistributions in binary form must reproduce the above copyright notice,
10   *   this list of conditions and the following disclaimer in the documentation
11   *   and/or other materials provided with the distribution. 
12   * - Neither the name of the openrdf.org nor the names of its contributors may
13   *   be used to endorse or promote products derived from this software without
14   *   specific prior written permission.
15   * 
16   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20   * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21   * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22   * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23   * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24   * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26   * POSSIBILITY OF SUCH DAMAGE.
27   * 
28   */
29  package org.openrdf.elmo.scutter.servlet;
30  
31  import java.io.File;
32  import java.io.IOException;
33  import java.io.PrintWriter;
34  import java.net.InetAddress;
35  import java.net.URL;
36  import java.util.Date;
37  import java.util.Iterator;
38  import java.util.List;
39  import java.util.regex.Pattern;
40  
41  import javax.servlet.ServletConfig;
42  import javax.servlet.ServletException;
43  import javax.servlet.http.HttpServlet;
44  import javax.servlet.http.HttpServletRequest;
45  import javax.servlet.http.HttpServletResponse;
46  
47  import org.openrdf.elmo.scutter.RetrieverFactory;
48  import org.openrdf.elmo.scutter.Scutter;
49  import org.openrdf.elmo.scutter.Util;
50  import org.openrdf.repository.Repository;
51  import org.slf4j.Logger;
52  import org.slf4j.LoggerFactory;
53  
54  
55  /**
56   * Web interface to the Elmo scutter. See the provided user manual for the
57   * documentation of initialization and request parameters.
58   * 
59   * 
60   * @author Peter Mika (pmika@cs.vu.nl)
61   *  
62   */
63  
64  public class ScutterServlet extends HttpServlet {
65  
66  	/**
67  	 * 
68  	 */
69  	private static final long serialVersionUID = -3682817881393469043L;
70  
71  	private final static String CONTENT_TYPE = "text/html";
72  
73  	//	max. number of foaf URLs to preload from Google
74  	private final static int DEFAULT_PRELOAD_SIZE = 500;
75  
76  	private static final int MAX_URLS = 100; //maximum number of starting points
77  
78  	private Thread _scutterThread = null;
79  
80  	private Scutter _scutter = null;
81  	
82  	private Repository _repository = null;
83  
84  	private File _queueFile = null; // file used for persistent storage of the
85  								   // queue
86      
87      private File _blacklistFile = null; // file used for persistent storage of the
88         // blacklist
89  
90  	private int _preloadSize = DEFAULT_PRELOAD_SIZE;
91  
92  	private String _running = "false"; //we cannot synchronize on primitive
93  									   // objects
94      
95  	private String[] _startURLs = null; 
96  
97  	public void init(ServletConfig config) throws ServletException {
98  		super.init(config);
99  
100 		
101 		_repository = Util.initRepository(config);
102 		String focused = config.getInitParameter("focused");
103         RetrieverFactory factory = null;
104 		try {
105 			if (focused != null && focused.length() > 0) {
106 				factory = new RetrieverFactory(_repository, true);
107 			} else {
108 				factory = new RetrieverFactory(_repository, false);
109 			}
110 		} catch (Exception e) {
111 			throw new ServletException("Factory failed to initialize", e);
112 		}
113         
114         try {
115             _scutter = new Scutter(factory);
116         } catch (Exception e) {
117             throw new ServletException("Scutter failed to initialize", e);
118         }
119         
120 		//QUEUE
121 		String queueName = config.getInitParameter("queue");
122 		if (queueName != null) {
123             _queueFile = new File(queueName); 
124             _scutter.setQueueFile(_queueFile);
125 		}
126 		
127 		//BLACKLIST
128         String blacklistName = config.getInitParameter("blacklist");
129         if (blacklistName != null) {
130             _blacklistFile = new File(blacklistName);
131             _scutter.setBlacklistFile(_blacklistFile);
132         }
133         
134 		//PRELOADSIZE
135 		String preloadSize = config.getInitParameter("preloadSize");
136 		if (preloadSize != null) {
137 			try {
138 				_preloadSize = Integer.parseInt(preloadSize);
139 			} catch (Exception e) {
140 				throw new ServletException(
141 						"Invalid value for <preloadSize> initialization parameter.");
142 			}
143 		}
144 
145 		//DOMAIN
146 		try {
147 			String domain = config.getInitParameter("domain"); //optional
148 															   // argument
149 			if (domain != null && !domain.equals("")) {
150 				Pattern domainPattern = Pattern.compile(domain);
151 				_scutter.setDomainPattern(domainPattern);
152 			}
153 		} catch (Exception e) {
154 			throw new ServletException(
155 					"Invalid value for <domain> initialization parameter.");
156 		}
157   
158 		//VOCAB
159         try {
160             String vocab = config.getInitParameter("vocab"); //optional
161                                                                // argument
162             if (vocab != null && vocab.equalsIgnoreCase("foaf")) {               
163                 factory.setFoafOnly(true);
164             }
165         } catch (Exception e) {
166             throw new ServletException(
167                     "Invalid value for <vocab> initialization parameter.");
168         }
169         
170 		//START
171 		String startString = config.getInitParameter("start"); //optional argument
172 		try {
173 		    _startURLs = new String[MAX_URLS];
174 			if (startString != null && !startString.equals("")) {
175 				String[] terms = startString.split("\\s");
176 				int count = 0;
177 				for (int i=0; i < Math.min(terms.length, MAX_URLS); i++) {
178 					terms[i] = terms[i].trim();
179 					if (terms[i].startsWith("http:")) {
180 						_startURLs[count++]=terms[i];
181 					}
182 				}
183 				_scutter.initQueue(_startURLs);
184 			}
185 		} catch (Exception e) {
186 			throw new ServletException(e);
187 		}
188 
189 		//METADATA
190 		String metadata = config.getInitParameter("metadata"); //optional
191 															   // argument
192 		if (metadata != null && metadata.equalsIgnoreCase("true")) {
193 			_scutter.setStoreMetadata(true);
194 		}
195 		
196 		//SIZELIMIT
197 		String sizelimit = config.getInitParameter("sizelimit"); //optional argument
198 		if (sizelimit != null) {
199 			try {
200 				_scutter.setSizeLimit(Integer.parseInt(sizelimit));
201 			} catch (NumberFormatException nfe) {
202 				throw new ServletException(
203 				"Invalid value for <sizelimit> initialization parameter.");
204 			}
205 		}
206         
207 		//AUTOBLACKLIST
208         String autoblacklist = config.getInitParameter("autoblacklist"); //optional argument
209         if (autoblacklist != null && autoblacklist.equalsIgnoreCase("false")) {
210             _scutter.setAutoBlackList(false);
211         } else {
212             _scutter.setAutoBlackList(true);
213         }
214 
215         //MAXTHREADS
216         String maxThreadsString = config.getInitParameter("maxThreads");
217         if (maxThreadsString != null) {
218             try {
219                 _scutter.setMaxThreads(Integer.parseInt(maxThreadsString));
220             } catch (NumberFormatException nfe) {
221                 throw new ServletException(
222                 "Invalid value for <maxThreads> initialization parameter.");
223             }
224         }
225         
226 	}
227 
228 	public void doGet(HttpServletRequest request, HttpServletResponse response)
229 			throws ServletException, IOException {
230 
231 		response.setContentType(CONTENT_TYPE);
232 
233 		PrintWriter out = response.getWriter();
234 		out.println("<html>");
235 		out.println("<head>");
236 		out.println("<title>");
237 		out.println("ScutterServlet");
238 		out.println("</title>");
239 		out.println("<link rel=\"stylesheet\" href=\"elmo.css\" type=\"text/css\">");
240 		out.println("</head>");
241 		out.println("<body>");
242 		out.println("<br/><b>ScutterServlet on </b>"
243 				+ this.getServletContext().getServerInfo() + "<b> at </b>"
244 				+ InetAddress.getLocalHost() + "<b> reporting...</b>");
245 		//TODO: write out repository ID
246 		out.println("<br><b>Using Sesame repository</b> " + _repository);
247 		out.println("<br/>");
248 		String operation = request.getParameter("operation");
249 		if (operation != null) {
250 			if (operation.equals("start")) {
251 				if (!_running.equals("true")) {
252 					synchronized (_running) {
253 						
254 						_scutterThread = new Thread(_scutter);
255 						_scutterThread.start();
256 						_running = "true";
257 						out
258 						.println("<br/><b>Scutter started successfully.</b><br/>");
259 					}
260 				}
261 			} else if (operation.equals("stop")) {
262 				if (!_running.equals("false")) {
263 					synchronized (_running) {
264 						
265 						try {
266 						    _scutter.stop();
267 						}
268 						catch (IOException ioe) {
269 						    out.println("Saving queue file failed, check servlet configuration.");
270 						}
271 						_running = "false";
272 					}
273 				}
274 				if (!_scutterThread.isAlive()) {
275 					out
276 							.println("<br/><b>Scutter stopped successfully.</b><br/>");
277 				}
278 			} 
279 			//TODO: consider removing this operation
280 //			else if (operation.equals("search")) {
281 //				List urlList = org.openrdf.elmo.util.Util.googleScrapeURLs(
282 //						"filetype:rdf foaf:Person", _preloadSize);
283 //				String[] urlArray = new String[urlList.size()];
284 //				for (int i = 0; i < urlList.size(); i++) {
285 //					urlArray[i] = (String) urlList.get(i);
286 //				}
287 //				_scutter.initQueue(urlArray);
288 //				out.println("<br/><b>Scutter queue preloaded with <b>"
289 //						+ urlList.size()
290 //						+ "</b> URLs (may not be unique).</b><br/>");
291 //			} 
292 			else if (operation.equals("preloadQueue")) {
293 				int loaded = 0;
294 				if (_queueFile.exists()) {
295 					loaded = _scutter.loadQueue();
296 				}
297 				out.println("<br/><b>Scutter queue preloaded from file <i>"
298 						+ _queueFile + "</i> with <b>" + loaded
299 						+ "</b> URLs.</b>");
300 			} else if (operation.equals("clear")) {
301 				if (_scutter.clear()) {
302 					out
303 							.println("<br/><b>Scutter queue and visited list cleared.</b><br/>");
304 				} else {
305 					out
306 							.println("<br/><b>Error while clearing scutter repository, see logs.</b><br/>");
307 				}
308 				//Put back the startURL
309 				if (_startURLs != null) {					
310 					_scutter.initQueue(_startURLs);
311 				}
312 			} else if (operation.equals("add")) {
313 				if (request.getParameter("url") != null
314 						&& !request.getParameter("url").equals("")) {
315 					try {
316 						_scutter.addURL(new URL((String) request
317 								.getParameter("url")));
318 					} catch (Exception e) {
319 						out
320 								.println("<br/><b>Error while adding URL. </b><br/>");
321 						e.printStackTrace();
322 					}
323 				} else {
324 					out.println("<b/r><b>Error while adding URL.</b><br/>");
325 				}
326 			}
327 
328 		}
329 		if (_scutterThread != null && _scutterThread.isAlive()) {
330 			out.println("<br/>Status:" + "running");
331 		} else {
332 			out.println("<br/>Status:" + "stopped");
333 		}
334 
335 		List queue = _scutter.getQueue();
336 		out.println("<br/>(1) Queue size:" + queue.size());
337 		out.println("<br/>(2) Seen so far:" + _scutter.getVisited().size());
338 		out.println("<br/>Crawled so far (2-1):" + (_scutter.getVisited().size() - queue.size()));
339 		
340 
341 		if (_scutter.getDomainPattern() != null) {
342 			out.println("<br/>Whitelist pattern: " +_scutter.getDomainPattern().pattern());
343 		}
344 		out.println("<br/>");
345 		out.println("<br/>Queue contents:<br/>");
346 		int counter = 0;
347 		synchronized (queue) {
348 			Iterator it = queue.iterator();
349 			while (it.hasNext()) {
350 				URL nextURL = (URL) it.next();
351 				out.println("<br/>" + counter++ + ". " + nextURL);
352 			}
353 		}
354 		out.println("<br/><small>" + new Date() + "</small>");
355 		out.println("</body>");
356 		out.println("</html>");
357 
358 		out.close();
359 	}
360 
361      /** For debugging purposes only.
362      * 
363      * @param args
364      * @throws Exception
365      */
366     public static void main(String[] args) throws Exception {
367         
368         Repository repository = Util.initRepository(args[0], args[1]);
369         
370         String focused = "true";
371         RetrieverFactory factory = null;
372         try {
373             if (focused != null && focused.length() > 0) {
374                 factory = new RetrieverFactory(repository, true);
375             } else {
376                 factory = new RetrieverFactory(repository, false);
377             }
378         } catch (Exception e) {
379             throw new ServletException("Factory failed to initialize", e);
380         }
381         Logger logger = LoggerFactory.getLogger(ScutterServlet.class);
382         
383         //TODO: configure Log4j logging
384 //        logger.setLevel(Level.DEBUG);
385 //        Logger.getLogger("org.apache").setLevel(Level.INFO);
386 //        Logger.getLogger("httpclient").setLevel(Level.INFO);
387         
388         
389         try {
390             Scutter scutter = new Scutter(factory); 
391             scutter.addURL(new URL(args[2]));
392             scutter.run();           
393             System.out.println("<br/><b>Scutter started successfully.</b><br/>");
394         } catch (Exception e) {
395             throw new ServletException("Scutter failed to initialize", e);
396         }
397         
398         
399         
400     }
401 }