View Javadoc

1   /*
2    * Copyright (c) 2007, Peter Mika All rights reserved.
3    * 
4    * Redistribution and use in source and binary forms, with or without
5    * modification, are permitted provided that the following conditions are met:
6    * 
7    * - Redistributions of source code must retain the above copyright notice, this
8    *   list of conditions and the following disclaimer.
9    * - Redistributions in binary form must reproduce the above copyright notice,
10   *   this list of conditions and the following disclaimer in the documentation
11   *   and/or other materials provided with the distribution. 
12   * - Neither the name of the openrdf.org nor the names of its contributors may
13   *   be used to endorse or promote products derived from this software without
14   *   specific prior written permission.
15   * 
16   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20   * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21   * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22   * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23   * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24   * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26   * POSSIBILITY OF SUCH DAMAGE.
27   * 
28   */
29  package org.openrdf.elmo.smusher;
30  
import java.io.UnsupportedEncodingException;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.openrdf.OpenRDFException;
import org.openrdf.elmo.sesame.SesameManager;
import org.openrdf.elmo.smusher.listener.SmusherListener;
import org.openrdf.elmo.smusher.Util;
import org.openrdf.model.Resource;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.query.BindingSet;
import org.openrdf.query.TupleQueryResult;
import org.openrdf.repository.Repository;
import org.openrdf.repository.RepositoryConnection;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
48  
49  public class EmailSmusher extends AbstractSmusher {
50       
51  	protected final static Logger _logger = LoggerFactory.getLogger(EmailSmusher.class);
52  	   
53      private final static String EMAIL_AND_CHECKSUM_QUERY = 
54          "SELECT DISTINCT person, mail FROM {person} foaf:mbox {mail} " + " UNION " +
55          "SELECT DISTINCT person, mail FROM {person} owl:sameAs {other} foaf:mbox {mail} " + " UNION " +
56          //"SELECT DISTINCT person, mail FROM {other} owl:sameAs {person}, {other} foaf:mbox {mail} " + " UNION " +   
57          "SELECT DISTINCT person, checksum FROM {person} foaf:mbox_sha1sum {checksum} " + " UNION " +
58          "SELECT DISTINCT person, checksum FROM {person} owl:sameAs {other} foaf:mbox_sha1sum {checksum} " +     
59          //"SELECT DISTINCT person, checksum FROM {other} owl:sameAs {person}, {other} foaf:mbox_sha1sum {checksum} " +   
60          "USING NAMESPACE foaf=<" + org.openrdf.model.vocabulary.FOAF.NAMESPACE + ">," +
61          "owl=<" + org.openrdf.model.vocabulary.OWL.NAMESPACE + ">";
62     
63      private static String normalizeEmail(Value email) {
64          String value = email.toString();
65          if (email instanceof URI) {
66              //add mailto: if required and calculate checksum
67              if (!value.startsWith("mailto:")) {
68                  value = value + "mailto:";
69              } 
70              try {
71                  value = Util.createSHA1(value);
72              } catch (UnsupportedEncodingException e) {
73                  //ignore
74              } catch (NoSuchAlgorithmException e) {
75                  //ignore
76              }
77          }
78          return value;
79      }
80      
81      	 
82      public void smush(SesameManager firstRepository, SesameManager secondRepository) throws OpenRDFException {
83          //Start all the registered listeners
84          Iterator it = _listeners.iterator();
85          while (it.hasNext()) {
86              SmusherListener nextListener = (SmusherListener) it.next();
87              nextListener.start();
88          }
89          
90      	//Select all emails and checksums and compare them pairwise
91     
92      	RepositoryConnection firstConnection = firstRepository.getConnection();
93   		RepositoryConnection secondConnection = null;
94   		   		
95  		TupleQueryResult firstResult = null;
96  		TupleQueryResult secondResult = null;
97  		
98  		try {
99  	     		
100  		    firstResult = firstConnection.prepareTupleQuery(org.openrdf.query.QueryLanguage.SERQL,
101  						EMAIL_AND_CHECKSUM_QUERY).evaluate();
102  		    
103  		    if (!firstRepository.equals(secondRepository)) {
104             	secondConnection = secondRepository.getConnection();
105      		    secondResult = secondConnection.prepareTupleQuery(org.openrdf.query.QueryLanguage.SERQL,
106      						EMAIL_AND_CHECKSUM_QUERY).evaluate();
107      		} 
108         	while (firstResult.hasNext()) {
109         		while (secondResult.hasNext()) {
110         			BindingSet firstSet = firstResult.next();
111         			BindingSet secondSet = secondResult.next();
112         			Value firstValue = firstSet.getValue("mail");
113                     Value secondValue = secondSet.getValue("mail");
114                     if (normalizeEmail(firstValue)
115                             .equalsIgnoreCase(normalizeEmail(secondValue))) {
116                         Iterator subit = _listeners.iterator();
117                         while (subit.hasNext()) {
118                             SmusherListener nextListener = (SmusherListener) subit.next();
119                             nextListener.matchFound(
120                                     (Resource) firstSet.getValue("person"), 
121                                     (Resource) secondSet.getValue("person"));    
122                         }
123                     }
124                    
125         		
126         		}        		
127         	}       	
128 		} finally {
129 			if (firstResult != null) firstResult.close();
130 			if (secondResult != null) secondResult.close();
131     	}
132         		
133          
134         //Finish all the registered listeners
135         it = _listeners.iterator();
136         while (it.hasNext()) {
137             SmusherListener nextListener = (SmusherListener) it.next();
138             nextListener.finish();
139         }
140         
141     }
142 }