View Javadoc

1   /*
2    * Copyright (c) 2007, Peter Mika All rights reserved.
3    * 
4    * Redistribution and use in source and binary forms, with or without
5    * modification, are permitted provided that the following conditions are met:
6    * 
7    * - Redistributions of source code must retain the above copyright notice, this
8    *   list of conditions and the following disclaimer.
9    * - Redistributions in binary form must reproduce the above copyright notice,
10   *   this list of conditions and the following disclaimer in the documentation
11   *   and/or other materials provided with the distribution. 
12   * - Neither the name of the openrdf.org nor the names of its contributors may
13   *   be used to endorse or promote products derived from this software without
14   *   specific prior written permission.
15   * 
16   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20   * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21   * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22   * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23   * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24   * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26   * POSSIBILITY OF SUCH DAMAGE.
27   * 
28   */
29  package org.openrdf.elmo.scutter;
30  
31  import java.net.URL;
32  
33  import org.openrdf.concepts.foaf.Person;
34  import org.openrdf.model.Resource;
35  import org.openrdf.model.URI;
36  import org.openrdf.model.Value;
37  import org.openrdf.model.vocabulary.RDF;
38  import org.openrdf.repository.Repository;
39  import org.slf4j.Logger;
40  import org.slf4j.LoggerFactory;
41  
42  
43  /**
44   * This retriever is specific to crawling for FOAF data. It only collects
45   * statements in the RDF, RDF-S, FOAF and GEO namespaces and doesn't follow
46   * seeAlso links in documents that do not contain foaf:Person instances.
47   * 
48   * @author Peter Mika (pmika@cs.vu.nl)
49   * 
50   */
51  public class FoafRetriever extends SimpleRetriever implements Retriever {
52  
53     
54     protected final static Logger _logger = LoggerFactory.getLogger(FoafRetriever.class);
55      
56     public FoafRetriever(final URL url, final Repository repository, 
57              final Scutter scutter) {
58          super(url, repository, scutter);
59         
60          setFilter(new FOAFStatementFilter());
61          _handler = new FOAFDocumentHandler((FOAFStatementFilter)_filter);
62      }   
63   
64      static class FOAFStatementFilter implements StatementFilter {
65          public final static String RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
66          public final static String RDFS_NS = "http://www.w3.org/2000/01/rdf-schema#";
67          public final static String FOAF_NS = "http://xmlns.com/foaf/0.1/";
68          public final static String GEO_NS = "http://www.w3.org/2003/01/geo/wgs84_pos#";
69  
70          // TODO: factor our openacademia specific parts
71          public final static String SWRC_NS = "http://swrc.ontoware.org/ontology#";
72          public final static String SWRC_EXT_NS = "http://www.cs.vu.nl/~mcaklein/onto/swrc-ext/2005/05#";
73          public final static String SOCIONET_NS = "http://www.cs.vu.nl/~pmika/socionet#";
74          public final static String OPENACADEMIA_NS = "http://www.openacademia.org#";
75          
76          private boolean foundRelevant = false;
77          
78          public boolean allowStatement(Resource subject, URI predicate, Value object) {
79              boolean relevant = false;
80             
81              // if we find a Person definition or any of the namespace
82              // used by openacademia then there is relevant data in the
83              if (!foundRelevant && (predicate.equals(RDF.TYPE) && 
84                  object.equals(Util.getType(Person.class))) ||
85                  // TODO: factor out openacademia specific parts
86                  predicate.toString().startsWith(SWRC_NS) ||
87                  predicate.toString().startsWith(SWRC_EXT_NS) ||
88                  predicate.toString().startsWith(SOCIONET_NS) ||
89                  predicate.toString().startsWith(OPENACADEMIA_NS) 
90                  ) {
91                  relevant = true;
92                  foundRelevant = true;
93              }
94              // What is relevant is a subset of the statements we allow through
95              if (predicate.toString().startsWith(RDF_NS) ||
96                  predicate.toString().startsWith(RDFS_NS) ||
97                  predicate.toString().startsWith(FOAF_NS) ||
98                  predicate.toString().startsWith(GEO_NS) || 
99                  relevant
100             ) {
101                 return true;
102             } else {
103                 return false;
104             }
105         }
106         
107         
108     }
109 
110    class FOAFDocumentHandler extends SimpleDocumentHandler {
111         
112         
113         public FOAFDocumentHandler(FOAFStatementFilter filter) {
114         	super(filter);
115         }
116 
117         
118         public boolean followLinks() {
119         	return ((FOAFStatementFilter) _filter).foundRelevant;
120         }
121         
122         public boolean aggregateContent() {
123         	return true;
124         }
125     }
126 
127 }
128 
129 
130