1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29 package org.openrdf.elmo.smusher;
30
import java.io.UnsupportedEncodingException;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.openrdf.OpenRDFException;
import org.openrdf.elmo.sesame.SesameManager;
import org.openrdf.elmo.smusher.listener.SmusherListener;
import org.openrdf.elmo.smusher.Util;
import org.openrdf.model.Resource;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.query.BindingSet;
import org.openrdf.query.TupleQueryResult;
import org.openrdf.repository.Repository;
import org.openrdf.repository.RepositoryConnection;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
48
49 public class EmailSmusher extends AbstractSmusher {
50
51 protected final static Logger _logger = LoggerFactory.getLogger(EmailSmusher.class);
52
53 private final static String EMAIL_AND_CHECKSUM_QUERY =
54 "SELECT DISTINCT person, mail FROM {person} foaf:mbox {mail} " + " UNION " +
55 "SELECT DISTINCT person, mail FROM {person} owl:sameAs {other} foaf:mbox {mail} " + " UNION " +
56
57 "SELECT DISTINCT person, checksum FROM {person} foaf:mbox_sha1sum {checksum} " + " UNION " +
58 "SELECT DISTINCT person, checksum FROM {person} owl:sameAs {other} foaf:mbox_sha1sum {checksum} " +
59
60 "USING NAMESPACE foaf=<" + org.openrdf.model.vocabulary.FOAF.NAMESPACE + ">," +
61 "owl=<" + org.openrdf.model.vocabulary.OWL.NAMESPACE + ">";
62
63 private static String normalizeEmail(Value email) {
64 String value = email.toString();
65 if (email instanceof URI) {
66
67 if (!value.startsWith("mailto:")) {
68 value = value + "mailto:";
69 }
70 try {
71 value = Util.createSHA1(value);
72 } catch (UnsupportedEncodingException e) {
73
74 } catch (NoSuchAlgorithmException e) {
75
76 }
77 }
78 return value;
79 }
80
81
82 public void smush(SesameManager firstRepository, SesameManager secondRepository) throws OpenRDFException {
83
84 Iterator it = _listeners.iterator();
85 while (it.hasNext()) {
86 SmusherListener nextListener = (SmusherListener) it.next();
87 nextListener.start();
88 }
89
90
91
92 RepositoryConnection firstConnection = firstRepository.getConnection();
93 RepositoryConnection secondConnection = null;
94
95 TupleQueryResult firstResult = null;
96 TupleQueryResult secondResult = null;
97
98 try {
99
100 firstResult = firstConnection.prepareTupleQuery(org.openrdf.query.QueryLanguage.SERQL,
101 EMAIL_AND_CHECKSUM_QUERY).evaluate();
102
103 if (!firstRepository.equals(secondRepository)) {
104 secondConnection = secondRepository.getConnection();
105 secondResult = secondConnection.prepareTupleQuery(org.openrdf.query.QueryLanguage.SERQL,
106 EMAIL_AND_CHECKSUM_QUERY).evaluate();
107 }
108 while (firstResult.hasNext()) {
109 while (secondResult.hasNext()) {
110 BindingSet firstSet = firstResult.next();
111 BindingSet secondSet = secondResult.next();
112 Value firstValue = firstSet.getValue("mail");
113 Value secondValue = secondSet.getValue("mail");
114 if (normalizeEmail(firstValue)
115 .equalsIgnoreCase(normalizeEmail(secondValue))) {
116 Iterator subit = _listeners.iterator();
117 while (subit.hasNext()) {
118 SmusherListener nextListener = (SmusherListener) subit.next();
119 nextListener.matchFound(
120 (Resource) firstSet.getValue("person"),
121 (Resource) secondSet.getValue("person"));
122 }
123 }
124
125
126 }
127 }
128 } finally {
129 if (firstResult != null) firstResult.close();
130 if (firstConnection != null) firstConnection.close();
131 if (secondResult != null) secondResult.close();
132 if (secondConnection != null) secondConnection.close();
133 }
134
135
136
137 it = _listeners.iterator();
138 while (it.hasNext()) {
139 SmusherListener nextListener = (SmusherListener) it.next();
140 nextListener.finish();
141 }
142
143 }
144 }