View Javadoc

1   /*
2    * Copyright (c) 2007, Peter Mika All rights reserved.
3    * 
4    * Redistribution and use in source and binary forms, with or without
5    * modification, are permitted provided that the following conditions are met:
6    * 
7    * - Redistributions of source code must retain the above copyright notice, this
8    *   list of conditions and the following disclaimer.
9    * - Redistributions in binary form must reproduce the above copyright notice,
10   *   this list of conditions and the following disclaimer in the documentation
11   *   and/or other materials provided with the distribution. 
12   * - Neither the name of the openrdf.org nor the names of its contributors may
13   *   be used to endorse or promote products derived from this software without
14   *   specific prior written permission.
15   * 
16   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20   * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21   * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22   * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23   * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24   * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26   * POSSIBILITY OF SUCH DAMAGE.
27   * 
28   */
29  package org.openrdf.elmo.smusher;
30  
31  import info.aduna.collections.iterators.Iterators;
32  import info.aduna.iteration.Iterations;
33  
34  import java.util.Iterator;
35  import java.util.List;
36  
37  import org.openrdf.OpenRDFException;
38  import org.openrdf.elmo.sesame.SesameManager;
39  import org.openrdf.elmo.smusher.listener.SmusherListener;
40  import org.openrdf.model.Resource;
41  import org.openrdf.model.Value;
42  import org.openrdf.query.BindingSet;
43  import org.openrdf.query.TupleQueryResult;
44  import org.openrdf.repository.RepositoryConnection;
45  import org.slf4j.Logger;
46  import org.slf4j.LoggerFactory;
47  
48  public class NameSmusher extends AbstractSmusher {
49  
50      private final static String NAME_QUERY = 
51          "SELECT DISTINCT person, name FROM {person} foaf:name {name} " + " UNION " +
52          "SELECT DISTINCT person, name FROM {person} owl:sameAs {other} foaf:name {name} " + 
53          //"SELECT DISTINCT person, name FROM {other} owl:sameAs {person}, {person} foaf:name {name} " + 
54          
55          "USING NAMESPACE foaf=<" + org.openrdf.model.vocabulary.FOAF.NAMESPACE + ">," +
56          "owl=<" + org.openrdf.model.vocabulary.OWL.NAMESPACE + ">";
57      
58     protected final static Logger _logger = LoggerFactory.getLogger(NameSmusher.class);
59  
60     public void smush(SesameManager firstRepository, SesameManager secondRepository) throws OpenRDFException {
61         //Start all the registered listeners
62         Iterator it = _listeners.iterator();
63         while (it.hasNext()) {
64             SmusherListener nextListener = (SmusherListener) it.next();
65             nextListener.start();
66         }
67         
68       	RepositoryConnection firstConnection = firstRepository.getConnection();
69  		RepositoryConnection secondConnection = null;
70  		   		
71  		
72  		
73  		
74  		try {
75  	     		
76   		    List<BindingSet> firstResult = Iterations.asList(firstConnection.prepareTupleQuery(org.openrdf.query.QueryLanguage.SERQL,
77   						NAME_QUERY).evaluate());
78   		    List<BindingSet> secondResult = firstResult;
79   		    if (!firstRepository.equals(secondRepository)) {
80              	secondConnection = secondRepository.getConnection();
81       		    secondResult = Iterations.asList(secondConnection.prepareTupleQuery(org.openrdf.query.QueryLanguage.SERQL,
82       						NAME_QUERY).evaluate());
83       		} 
84          
85   		   int counter = 1;
86   		   for (int i = 0; i < firstResult.size(); i++) {
87   				if (_logger.isDebugEnabled()) {
88  	                if ((counter++) % 10000 == 0) {
89  	                    _logger.debug("Processed 10000 instances"); 
90  	                }
91  	            }
92   				for (int j = 0; j < secondResult.size(); j++) {
93   					BindingSet firstSet = firstResult.get(i);
94  	    			BindingSet secondSet = secondResult.get(j);
95  	    			Value firstValue = firstSet.getValue("name");
96  	                Value secondValue = secondSet.getValue("name");
97  	                if (org.openrdf.elmo.smusher.Util.matchNames(firstValue.toString(), secondValue.toString())) {
98  	                    Iterator subit = _listeners.iterator();
99  	                    while (subit.hasNext()) {
100 	                        SmusherListener nextListener = (SmusherListener) subit.next();
101 	                        nextListener.matchFound(
102 	                        		(Resource) firstSet.getValue("person"), 
103 	                        		(Resource) secondSet.getValue("person"));    
104 	                    }
105 	                }
106 	               
107 	            }
108  		   }
109         } finally {			
110 			if (firstConnection != null) firstConnection.close();
111 			if (secondConnection != null) secondConnection.close();			
112     	}
113             
114         //Finish all the registered listeners
115         it = _listeners.iterator();
116         while (it.hasNext()) {
117             SmusherListener nextListener = (SmusherListener) it.next();
118             nextListener.finish();
119         }
120 
121     }
122 
123 }