Skip to content

Commit 670d56c

Browse files
committed
Big refactor:
multiple interfaces
1 parent 2d26e91 commit 670d56c

26 files changed

+1239
-430
lines changed

README.md

Lines changed: 200 additions & 48 deletions
Large diffs are not rendered by default.

src/main/java/info/debatty/java/stringsimilarity/Cosine.java

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,15 @@
2424

2525
package info.debatty.java.stringsimilarity;
2626

27+
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringSimilarity;
28+
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringDistance;
29+
2730
/**
2831
* @author Thibault Debatty
2932
*/
30-
public class Cosine extends SetBasedStringSimilarity {
33+
public class Cosine extends ShingleBased implements
34+
NormalizedStringDistance, NormalizedStringSimilarity{
3135

32-
/**
33-
* @param args the command line arguments
34-
*/
3536
public static void main(String[] args) {
3637
Cosine cos = new Cosine(3);
3738

@@ -49,10 +50,12 @@ public static void main(String[] args) {
4950
// 1 1
5051
// similarity = .95
5152
System.out.println(cos.similarity("ABAB", "BAB"));
53+
54+
5255
}
5356

5457
/**
55-
* Implements Cosine Similarity.
58+
* Implements Cosine Similarity between strings.
5659
* The strings are first transformed into vectors of occurrences of k-shingles
5760
* (sequences of k characters). In this n-dimensional space, the similarity
5861
* between the two strings is the cosine of their respective vectors.
@@ -64,19 +67,20 @@ public Cosine(int k) {
6467
}
6568

6669
public Cosine() {
67-
super(3);
70+
super();
6871
}
69-
70-
71-
public double similarity(int[] profile1, int[] profile2) {
72-
72+
73+
74+
public double similarity(String s1, String s2) {
75+
KShingling ks = new KShingling(k);
76+
int[] profile1 = ks.getArrayProfile(s1);
77+
int[] profile2 = ks.getArrayProfile(s2);
78+
7379
return dotProduct(profile1, profile2) / (norm(profile1) * norm(profile2));
7480
}
75-
76-
7781

7882
/**
79-
* Compute the norm L2 : sqrt(Sum_i( v_i^2))
83+
* Compute the norm L2 : sqrt(Sum_i( v_i²))
8084
* @param profile
8185
* @return L2 norm
8286
*/
@@ -101,4 +105,9 @@ protected static double dotProduct(int[] profile1, int[] profile2) {
101105
}
102106
return agg;
103107
}
108+
109+
public double distance(String s1, String s2) {
110+
return 1.0 - similarity(s1, s2);
111+
}
112+
104113
}

src/main/java/info/debatty/java/stringsimilarity/Damerau.java

Lines changed: 15 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
package info.debatty.java.stringsimilarity;
2525

2626
import java.util.HashMap;
27+
import info.debatty.java.stringsimilarity.interfaces.StringDistance;
2728

2829
/**
2930
* Implementation of Damerau-Levenshtein distance, computed as the
@@ -34,38 +35,34 @@
3435
* This is not to be confused with the optimal string alignment distance, which
3536
* is an extension where no substring can be edited more than once.
3637
*
38+
* Also, Damerau-Levenshtein does not respect triangle inequality, and is thus
39+
* not a metric distance.
40+
*
3741
* @author Thibault Debatty
3842
*/
39-
public class Damerau implements StringSimilarityInterface {
43+
public class Damerau implements StringDistance {
4044

41-
/**
42-
* @param args the command line arguments
43-
*/
45+
4446
public static void main(String[] args) {
4547

4648
Damerau d = new Damerau();
47-
// 1 switch
48-
System.out.println(d.absoluteDistance("ABCDEF", "ABDCEF"));
4949

50-
// 2 switches
51-
System.out.println(d.absoluteDistance("ABCDEF", "BACDFE"));
50+
// 1 transposition (adjacent C and D swapped)
51+
System.out.println(d.distance("ABCDEF", "ABDCEF"));
5252

53-
// 1 deletion
54-
System.out.println(d.absoluteDistance("ABCDEF", "ABCDE"));
53+
// 2 transpositions (A/B swapped and E/F swapped)
54+
System.out.println(d.distance("ABCDEF", "BACDFE"));
5555

5656
// 1 deletion
57-
System.out.println(d.absoluteDistance("ABCDEF", "BCDEF"));
58-
System.out.println(d.absoluteDistance("ABCDEF", "ABCGDEF"));
59-
System.out.println(d.absoluteDistance("ABCDEF", "BCDAEF"));
60-
61-
System.out.println(d.distance("ABCDEF", "GHABCDE"));
57+
System.out.println(d.distance("ABCDEF", "ABCDE"));
58+
System.out.println(d.distance("ABCDEF", "BCDEF"));
59+
System.out.println(d.distance("ABCDEF", "ABCGDEF"));
6260

6361
// All different
64-
System.out.println(d.absoluteDistance("ABCDEF", "POIU"));
65-
System.out.println(d.similarity("ABCDEF", "POIU"));
62+
System.out.println(d.distance("ABCDEF", "POIU"));
6663
}
6764

68-
public int absoluteDistance(String s1, String s2) {
65+
public double distance(String s1, String s2) {
6966

7067
// INFinite distance is the max possible distance
7168
int INF = s1.length() + s2.length();
@@ -128,14 +125,6 @@ public int absoluteDistance(String s1, String s2) {
128125

129126
return H[s1.length() + 1][s2.length() + 1];
130127
}
131-
132-
public double similarity(String s1, String s2) {
133-
return 1.0 - distance(s1, s2);
134-
}
135-
136-
public double distance(String s1, String s2) {
137-
return (double) absoluteDistance(s1, s2) / Math.max(s1.length(), s2.length());
138-
}
139128

140129
protected static int min(int a, int b, int c, int d) {
141130
return Math.min(a, Math.min(b, Math.min(c, d)));

src/main/java/info/debatty/java/stringsimilarity/Jaccard.java

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,17 @@
2424

2525
package info.debatty.java.stringsimilarity;
2626

27+
import info.debatty.java.stringsimilarity.interfaces.MetricStringDistance;
28+
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringSimilarity;
29+
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringDistance;
30+
2731
/**
2832
*
2933
* @author Thibault Debatty
3034
*/
31-
public class Jaccard extends SetBasedStringSimilarity {
35+
public class Jaccard extends ShingleBased implements
36+
MetricStringDistance, NormalizedStringDistance, NormalizedStringSimilarity {
3237

33-
/**
34-
* @param args the command line arguments
35-
*/
3638
public static void main(String[] args) {
3739
Jaccard j2 = new Jaccard(2);
3840

@@ -41,10 +43,9 @@ public static void main(String[] args) {
4143
// 1 1 1 0 1
4244
// => 3 / 5 = 0.6
4345
System.out.println(j2.similarity("ABCDE", "ABCDF"));
46+
4447
}
4548

46-
47-
4849
/**
4950
* The strings are first transformed into sets of k-shingles (sequences of k
5051
* characters), then Jaccard index is computed as |A inter B| / |A union B|.
@@ -56,25 +57,19 @@ public Jaccard(int k) {
5657
super(k);
5758
}
5859

60+
/**
61+
*
62+
*/
5963
public Jaccard() {
60-
super(3);
64+
super();
6165
}
6266

6367

64-
/**
65-
* Compute and return the Jaccard index similarity between two string profiles.
66-
*
67-
* E.g:
68-
* ks = new KShingling(4)
69-
* profile1 = ks.getProfile("My String")
70-
* profile2 = ks.getProfile("My other string")
71-
* similarity = jaccard.similarity(profile1, profile2)
72-
*
73-
* @param profile1
74-
* @param profile2
75-
* @return
76-
*/
77-
public double similarity(int[] profile1, int[] profile2) {
68+
public double similarity(String s1, String s2) {
69+
KShingling ks = new KShingling(k);
70+
int[] profile1 = ks.getArrayProfile(s1);
71+
int[] profile2 = ks.getArrayProfile(s2);
72+
7873
int length = Math.max(profile1.length, profile2.length);
7974
profile1 = java.util.Arrays.copyOf(profile1, length);
8075
profile2 = java.util.Arrays.copyOf(profile2, length);
@@ -94,4 +89,9 @@ public double similarity(int[] profile1, int[] profile2) {
9489

9590
return (double) inter / union;
9691
}
92+
93+
94+
public double distance(String s1, String s2) {
95+
return 1.0 - similarity(s1, s2);
96+
}
9797
}

src/main/java/info/debatty/java/stringsimilarity/JaroWinkler.java

Lines changed: 28 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,67 +1,37 @@
11
package info.debatty.java.stringsimilarity;
22

3+
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringSimilarity;
4+
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringDistance;
35
import java.util.Arrays;
46

57
/**
68
*
79
* @author tibo
810
*/
9-
public class JaroWinkler implements StringSimilarityInterface {
11+
public class JaroWinkler implements NormalizedStringSimilarity, NormalizedStringDistance {
1012

1113

1214
public static void main(String[] args) {
1315
JaroWinkler jw = new JaroWinkler();
1416

15-
System.out.println(jw.distance("My string", "My $tring"));
16-
System.out.println(jw.similarity("My string", "My $tring"));
17+
// transposition of adjacent s and t
18+
System.out.println(jw.similarity("My string", "My tsring"));
19+
20+
// swap of non-adjacent s and n
21+
System.out.println(jw.similarity("My string", "My ntrisg"));
1722
}
1823

19-
/**
20-
* Jaro-Winkler is string edit distance that was developed in the area of
21-
* record linkage (duplicate detection) (Winkler, 1990).
22-
*
23-
* The Jaro–Winkler distance metric is designed and best suited for short
24-
* strings such as person names, and to detect typos.
25-
*
26-
* http://en.wikipedia.org/wiki/Jaro-Winkler_distance
27-
*
28-
* @param s0
29-
* @param s1
30-
* @return
31-
*/
32-
public static double Similarity(String s0, String s1) {
33-
JaroWinkler jw = new JaroWinkler();
34-
return jw.similarity(s0, s1);
35-
}
3624

37-
private double threshold = 0.7;
38-
3925
public JaroWinkler() {
4026

4127
}
4228

4329
public JaroWinkler(double threshold) {
4430
this.setThreshold(threshold);
4531
}
46-
47-
@Override
48-
public double similarity(String s1, String s2) {
49-
int[] mtp = matches(s1, s2);
50-
float m = mtp[0];
51-
if (m == 0) {
52-
return 0f;
53-
}
54-
float j = ((m / s1.length() + m / s2.length() + (m - mtp[1]) / m)) / 3;
55-
float jw = j < getThreshold() ? j : j + Math.min(0.1f, 1f / mtp[3]) * mtp[2]
56-
* (1 - j);
57-
return jw;
58-
}
5932

60-
@Override
61-
public double distance(String s1, String s2) {
62-
return 1.0 - similarity(s1, s2);
63-
}
64-
33+
private double threshold = 0.7;
34+
6535
/**
6636
* Sets the threshold used to determine when Winkler bonus should be used.
6737
* Set to a value greater than 1 to never apply the bonus and get the plain Jaro similarity.
@@ -83,6 +53,24 @@ public double getThreshold() {
8353
return threshold;
8454
}
8555

56+
public double similarity(String s1, String s2) {
57+
int[] mtp = matches(s1, s2);
58+
float m = mtp[0];
59+
if (m == 0) {
60+
return 0f;
61+
}
62+
float j = ((m / s1.length() + m / s2.length() + (m - mtp[1]) / m)) / 3;
63+
float jw = j < getThreshold() ? j : j + Math.min(0.1f, 1f / mtp[3]) * mtp[2]
64+
* (1 - j);
65+
return jw;
66+
}
67+
68+
69+
public double distance(String s1, String s2) {
70+
return 1.0 - similarity(s1, s2);
71+
}
72+
73+
8674
private int[] matches(String s1, String s2) {
8775
String max, min;
8876
if (s1.length() > s2.length()) {

0 commit comments

Comments
 (0)