1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29 package org.openrdf.elmo.smusher;
30
31
32
33
34
35 import info.aduna.collections.iterators.Iterators;
36
37 import java.io.IOException;
38 import java.io.UnsupportedEncodingException;
39 import java.net.MalformedURLException;
40 import java.net.URL;
41 import java.security.MessageDigest;
42 import java.security.NoSuchAlgorithmException;
43 import java.util.Enumeration;
44 import java.util.Hashtable;
45 import java.util.List;
46
47 import javax.servlet.ServletConfig;
48 import javax.servlet.ServletContext;
49 import javax.servlet.ServletException;
50 import javax.servlet.http.HttpServletRequest;
51
52 import org.openrdf.OpenRDFException;
53 import org.openrdf.elmo.ElmoManager;
54 import org.openrdf.elmo.ElmoManagerFactory;
55 import org.openrdf.elmo.ElmoModule;
56 import org.openrdf.elmo.ElmoQuery;
57 import org.openrdf.elmo.sesame.SesameManager;
58 import org.openrdf.elmo.sesame.SesameManagerFactory;
59 import org.openrdf.model.Literal;
60 import org.openrdf.model.Statement;
61 import org.openrdf.model.URI;
62 import org.openrdf.model.impl.LiteralImpl;
63 import org.openrdf.repository.Repository;
64 import org.openrdf.repository.RepositoryConnection;
65 import org.openrdf.repository.RepositoryException;
66 import org.openrdf.repository.http.HTTPRepository;
67 import org.slf4j.Logger;
68 import org.slf4j.LoggerFactory;
69
70 public class Util {
71
72
73 public final static double SIMILARITY_THRESHOLD = 0.85;
74
75 private final static int NGRAM_SIZE = 2;
76
77
78
79 public final static String SERVLET_SERVER_PARAMETER = "server";
80
81 public final static String SERVLET_REPOSITORY_PARAMETER = "repository";
82
83 public final static String SERVLET_USERNAME_PARAMETER = "username";
84
85 public final static String SERVLET_PASSWORD_PARAMETER = "password";
86
87 public final static String CONTEXT_SERVER_PARAMETER = SERVLET_SERVER_PARAMETER;
88
89 public final static String CONTEXT_REPOSITORY_PARAMETER = SERVLET_REPOSITORY_PARAMETER;
90
91 public final static String CONTEXT_USERNAME_PARAMETER = SERVLET_USERNAME_PARAMETER;
92
93 public final static String CONTEXT_PASSWORD_PARAMETER = SERVLET_PASSWORD_PARAMETER;
94
95
96 public final static String REQUEST_REPOSITORY_ATTRIBUTE = "sesame";
97
/**
 * SLF4J logger obtained for the {@code Util} class. It is declared
 * {@code protected} and non-final, so it is visible to subclasses and to
 * other classes in this package.
 */
protected static Logger _logger = LoggerFactory.getLogger(Util.class);
99
100
101
102 private static Repository getLocalRepository(String name) throws RepositoryException {
103
104
105
106
107
108
109
110
111 throw new RuntimeException("Not implemented");
112 }
113
114 private static Repository getRemoteRepository(String sesameServer, String repositoryID) throws RepositoryException {
115 Repository repository = new HTTPRepository(sesameServer, repositoryID);
116 repository.initialize();
117
118 return repository;
119 }
120
121 public static SesameManager initManager(Repository repository) {
122 ElmoManagerFactory factory = new SesameManagerFactory(new ElmoModule(), repository);
123 ElmoManager manager = factory.createElmoManager();
124
125 return (SesameManager) manager;
126 }
127
128
129
130
131
132
133
134
135
136 public static Repository initRepository(String repositoryLocation,
137 String repositoryName) throws Exception {
138 return initRepository(repositoryLocation, repositoryName, null, null);
139 }
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159 public static Repository initRepository(String repositoryLocation,
160 String repositoryName, String username, String password)
161 throws Exception {
162 Repository repository = null;
163
164
165 if ((repositoryLocation == null || repositoryLocation.equals("")) &&
166 repositoryName != null && !repositoryName.equals("")) {
167
168
169
170 repository = getLocalRepository(repositoryName);
171
172 } else {
173
174 try {
175 URL repositoryURL = new URL(repositoryLocation);
176 } catch (MalformedURLException mue) {
177 throw new Exception("Repository location contains a malformed URL");
178 }
179 if (repositoryName == null || repositoryName.equals("")) {
180 throw new Exception("Repository name missing");
181 }
182
183 repository = getRemoteRepository(repositoryLocation, repositoryName);
184
185 }
186
187 return repository;
188 }
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209 public static Repository initRepository(HttpServletRequest request)
210 throws ServletException {
211 String repositoryLocation = null;
212 String repositoryName = null;
213 String userName = null;
214 String password = null;
215
216 Repository repository = null;
217
218 if (request.getAttribute(REQUEST_REPOSITORY_ATTRIBUTE) != null) {
219 repository = (Repository) request
220 .getAttribute(REQUEST_REPOSITORY_ATTRIBUTE);
221 } else {
222
223 repositoryLocation = request.getParameter(SERVLET_SERVER_PARAMETER);
224 repositoryName = request.getParameter(SERVLET_REPOSITORY_PARAMETER);
225
226 if (request.getParameter(SERVLET_USERNAME_PARAMETER) != null
227 && request.getParameter(SERVLET_PASSWORD_PARAMETER) != null) {
228
229 userName = request.getParameter(SERVLET_USERNAME_PARAMETER);
230 password = request.getParameter(SERVLET_PASSWORD_PARAMETER);
231 }
232
233 try {
234 repository = Util.initRepository(repositoryLocation,
235 repositoryName, userName, password);
236 } catch (Exception e) {
237 throw new ServletException("Repository failed to initialize", e);
238 }
239 }
240 return repository;
241 }
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265 public static Repository initRepository(ServletConfig config)
266 throws ServletException {
267 String repositoryLocation = null;
268 String repositoryName = null;
269 String userName = null;
270 String password = null;
271
272 Repository repository = null;
273
274 repositoryLocation = config.getInitParameter(SERVLET_SERVER_PARAMETER);
275 repositoryName = config.getInitParameter(SERVLET_REPOSITORY_PARAMETER);
276
277 if (config.getInitParameter(SERVLET_USERNAME_PARAMETER) != null
278 && config.getInitParameter(SERVLET_PASSWORD_PARAMETER) != null) {
279
280 userName = config.getInitParameter(SERVLET_USERNAME_PARAMETER);
281 password = config.getInitParameter(SERVLET_PASSWORD_PARAMETER);
282 }
283
284 try {
285 repository = Util.initRepository(repositoryLocation, repositoryName,
286 userName, password);
287 return repository;
288 } catch (Exception e) {
289
290 }
291
292
293 ServletContext context = config.getServletContext();
294 repositoryLocation = context.getInitParameter(CONTEXT_SERVER_PARAMETER);
295 repositoryName = context.getInitParameter(CONTEXT_REPOSITORY_PARAMETER);
296
297 if (context.getInitParameter(CONTEXT_USERNAME_PARAMETER) != null
298 && context.getInitParameter(CONTEXT_PASSWORD_PARAMETER) != null) {
299
300 userName = context.getInitParameter(CONTEXT_USERNAME_PARAMETER);
301 password = context.getInitParameter(CONTEXT_PASSWORD_PARAMETER);
302 } else {
303 userName = null;
304 password = null;
305 }
306
307 try {
308 repository = Util.initRepository(repositoryLocation, repositoryName,
309 userName, password);
310 } catch (Exception e) {
311 throw new ServletException("Repository failed to initialize", e);
312 }
313
314 return repository;
315 }
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345 public static Repository initRepository(ServletConfig config,
346 HttpServletRequest request) throws ServletException {
347 Repository repository = null;
348
349 try {
350 repository = initRepository(request);
351
352 } catch (ServletException se) {
353
354 }
355
356 if (repository == null) {
357 repository = initRepository(config);
358 }
359 return repository;
360 }
361
362 private static class Gram {
363 String gram;
364
365 int occurences;
366
367 String str;
368
369 public Gram(String inGram, String inString) {
370 gram = inGram;
371 str = inString;
372 occurences = 0;
373 int ng = gram.length();
374 int len = str.length();
375 for (int i = 0; i <= (len - ng); i++) {
376 String gr = str.substring(i, i + ng);
377 if (gram.equalsIgnoreCase(gr)) {
378 occurences++;
379 }
380 }
381 }
382 }
383
384
385 public static double compare(String str1, String str2, int ng) {
386 if (ng < 2) {
387 ng = 2;
388 }
389 if (ng > 9) {
390 ng = 9;
391 }
392 char blank = ' ';
393 char underline = '_';
394 str1 = str1.replace(blank, underline);
395 str2 = str2.replace(blank, underline);
396
397 Hashtable<String, Gram> gramList1 = new Hashtable<String, Gram>();
398 int len = str1.length();
399 for (int i = 0; i <= (len - ng); i++) {
400 String gr = str1.substring(i, i + ng);
401 Gram gram = new Gram(gr, str1);
402 if (gram.occurences > 0) {
403 gramList1.put(gram.gram, gram);
404 }
405 }
406 Hashtable<String, Gram> gramList2 = new Hashtable<String, Gram>();
407 len = str2.length();
408 for (int i = 0; i <= (len - ng); i++) {
409 String gr = str2.substring(i, i + ng);
410 Gram gram = new Gram(gr, str2);
411 if (gram.occurences > 0) {
412 gramList2.put(gram.gram, gram);
413 }
414 }
415 double difference = 0;
416 double sum = 0;
417 Hashtable<String, Gram> diffGrams = new Hashtable<String, Gram>();
418
419 Enumeration keys1 = gramList1.keys();
420 while (keys1.hasMoreElements()) {
421 String key = (String) keys1.nextElement();
422 Gram g = (Gram) gramList1.get(key);
423 double plus = (g.occurences);
424 double squarePlus = Math.pow(plus, 2);
425 sum = sum + squarePlus;
426 if (gramList2.containsKey(key)) {
427 Gram gr2 = (Gram) gramList2.get(key);
428 double minus = (g.occurences - gr2.occurences);
429 double square = Math.pow(minus, 2);
430 difference = difference + square;
431 } else {
432 double square = Math.pow(g.occurences, 2);
433 difference = difference + square;
434 }
435 }
436 Enumeration keys2 = gramList2.keys();
437 while (keys2.hasMoreElements()) {
438 String key = (String) keys2.nextElement();
439 Gram gr2 = (Gram) gramList2.get(key);
440 if (gramList1.containsKey(key)) {
441 } else {
442 diffGrams.put(key, gr2);
443 }
444 }
445 Enumeration diffKeys = diffGrams.keys();
446 while (diffKeys.hasMoreElements()) {
447 String key = (String) diffKeys.nextElement();
448 Gram diffGram = (Gram) diffGrams.get(key);
449 double square = Math.pow(diffGram.occurences, 2);
450 difference = difference + square;
451 double squarePlus = Math.pow(diffGram.occurences, 2);
452 sum = sum + squarePlus;
453 }
454
455 int totalLength = str1.length() - ng + 1 + diffGrams.size();
456 double threshold = 2.486 + 0.025 * totalLength;
457 difference = Math.sqrt(difference);
458
459
460
461
462 double similarity = 0;
463 if (difference < threshold) {
464 similarity = 0.8 + ((threshold - difference) / (5 * threshold));
465 } else if (difference > threshold) {
466 similarity = 0.8 - (((difference - threshold) * 4) / ((1 + difference - threshold) * 5));
467 } else {
468 similarity = 0.8;
469 }
470
471 return similarity;
472 }
473
474
475
476
477
478
479
480
481 public static String invertName(String name) {
482 int delim = name.indexOf(',');
483 if (delim != -1) {
484 return name.substring(delim + 1, name.length()).trim() + ' ' +
485 name.substring(0, delim).trim();
486 } else {
487 return name;
488 }
489 }
490
491
492
493
494
495
496
497 public static String getFirstName(String name) {
498
499 if (name.indexOf(',') != -1) {
500 name = invertName(name);
501 }
502 int delim = name.indexOf(" ");
503 if (delim < 0) {
504
505 delim = name.length();
506 }
507 return name.substring(0, delim);
508 }
509
510
511
512
513
514
515 public static String getLastName(String name) {
516
517 if (name.indexOf(',') != -1) {
518 name = invertName(name);
519 }
520 int delim = name.lastIndexOf(" ");
521 if (delim < 0) {
522
523 return "";
524 } else {
525 return name.substring(delim + 1, name.length());
526 }
527 }
528
529
530
531
532
533
534
535
536 public static boolean matchNames(String name1, String name2) {
537 boolean result = true;
538
539 String first1 = getFirstName(name1.trim());
540 String last1 = getLastName(name1.trim());
541 String first2 = getFirstName(name2.trim());
542 String last2 = getLastName(name2.trim());
543
544
545 if (last1.length() < 2 || last2.length() < 2) {
546 return false;
547 }
548
549
550 if (!last1.equalsIgnoreCase(last2)) {
551 return false;
552 }
553
554
555
556 if (first1.indexOf(".") != -1 || first2.indexOf(".") != -1) {
557 if (!(first1.charAt(0) == first2.charAt(0))) {
558 return false;
559 }
560 } else {
561
562 if (
563
564
565 compare(first1, first2, NGRAM_SIZE) < SIMILARITY_THRESHOLD) {
566 return false;
567 }
568 }
569
570 return result;
571 }
572
573
574
575
576
577
578
579
580 public static void addStatememt(Repository rep, Statement stmt)
581 throws OpenRDFException, IOException {
582 RepositoryConnection con = null;
583
584 try {
585 con = rep.getConnection();
586
587 con.add(stmt);
588 }
589 finally {
590 if (con != null)
591 con.close();
592
593
594 }
595 }
596
597
598
599
600
601
602
603
604
605 public static <T extends org.openrdf.concepts.rdfs.Resource> List<T> getAllInstances(ElmoManager manager, Class<T> cl) {
606
607
608 Iterable<T> query = manager.findAll(cl);
609 return Iterators.asList(query.iterator());
610 }
611
612
613
614
615
616
617
618
619
620 public final static String createSHA1(String str)
621 throws UnsupportedEncodingException, NoSuchAlgorithmException {
622 String result = "";
623 byte[] theTextToDigestAsBytes = str.getBytes( "8859_1"
624 MessageDigest md = MessageDigest.getInstance( "SHA" );
625 md.update( theTextToDigestAsBytes );
626 byte[] digest = md.digest();
627
628 for (int i=0; i<digest.length; i++) {
629 String hex = Integer.toHexString(digest[i]);
630 if (hex.length() == 1) hex = "0" + hex;
631 result += hex.substring(hex.length()-2);
632 }
633 return result.toUpperCase();
634 }
635
636
637
638
639
640
641
642
643 public final static Literal createSha1Sum(String email)
644 throws UnsupportedEncodingException, NoSuchAlgorithmException {
645 return new LiteralImpl(createSHA1("mailto:" + email));
646 }
647
648
649
650
651
652
653
654
655 public final static Literal createSha1Sum(URI email)
656 throws UnsupportedEncodingException, NoSuchAlgorithmException {
657 return new LiteralImpl(createSHA1(email.toString()));
658 }
659
660 }