1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29 package org.openrdf.elmo.smusher;
30
31 import info.aduna.collections.iterators.Iterators;
32 import info.aduna.iteration.Iterations;
33
34 import java.util.Iterator;
35 import java.util.List;
36
37 import org.openrdf.OpenRDFException;
38 import org.openrdf.elmo.sesame.SesameManager;
39 import org.openrdf.elmo.smusher.listener.SmusherListener;
40 import org.openrdf.model.Resource;
41 import org.openrdf.model.Value;
42 import org.openrdf.query.BindingSet;
43 import org.openrdf.query.TupleQueryResult;
44 import org.openrdf.repository.RepositoryConnection;
45 import org.slf4j.Logger;
46 import org.slf4j.LoggerFactory;
47
48 public class NameSmusher extends AbstractSmusher {
49
50 private final static String NAME_QUERY =
51 "SELECT DISTINCT person, name FROM {person} foaf:name {name} " + " UNION " +
52 "SELECT DISTINCT person, name FROM {person} owl:sameAs {other} foaf:name {name} " +
53
54
55 "USING NAMESPACE foaf=<" + org.openrdf.model.vocabulary.FOAF.NAMESPACE + ">," +
56 "owl=<" + org.openrdf.model.vocabulary.OWL.NAMESPACE + ">";
57
58 protected final static Logger _logger = LoggerFactory.getLogger(NameSmusher.class);
59
60 public void smush(SesameManager firstRepository, SesameManager secondRepository) throws OpenRDFException {
61
62 Iterator it = _listeners.iterator();
63 while (it.hasNext()) {
64 SmusherListener nextListener = (SmusherListener) it.next();
65 nextListener.start();
66 }
67
68 RepositoryConnection firstConnection = firstRepository.getConnection();
69 RepositoryConnection secondConnection = null;
70
71
72
73
74 try {
75
76 List<BindingSet> firstResult = Iterations.asList(firstConnection.prepareTupleQuery(org.openrdf.query.QueryLanguage.SERQL,
77 NAME_QUERY).evaluate());
78 List<BindingSet> secondResult = firstResult;
79 if (!firstRepository.equals(secondRepository)) {
80 secondConnection = secondRepository.getConnection();
81 secondResult = Iterations.asList(secondConnection.prepareTupleQuery(org.openrdf.query.QueryLanguage.SERQL,
82 NAME_QUERY).evaluate());
83 }
84
85 int counter = 1;
86 for (int i = 0; i < firstResult.size(); i++) {
87 if (_logger.isDebugEnabled()) {
88 if ((counter++) % 10000 == 0) {
89 _logger.debug("Processed 10000 instances");
90 }
91 }
92 for (int j = 0; j < secondResult.size(); j++) {
93 BindingSet firstSet = firstResult.get(i);
94 BindingSet secondSet = secondResult.get(j);
95 Value firstValue = firstSet.getValue("name");
96 Value secondValue = secondSet.getValue("name");
97 if (org.openrdf.elmo.smusher.Util.matchNames(firstValue.toString(), secondValue.toString())) {
98 Iterator subit = _listeners.iterator();
99 while (subit.hasNext()) {
100 SmusherListener nextListener = (SmusherListener) subit.next();
101 nextListener.matchFound(
102 (Resource) firstSet.getValue("person"),
103 (Resource) secondSet.getValue("person"));
104 }
105 }
106
107 }
108 }
109 } finally {
110 if (firstConnection != null) firstConnection.close();
111 if (secondConnection != null) secondConnection.close();
112 }
113
114
115 it = _listeners.iterator();
116 while (it.hasNext()) {
117 SmusherListener nextListener = (SmusherListener) it.next();
118 nextListener.finish();
119 }
120
121 }
122
123 }