Skip to content

Commit a14db08

Browse files
authored
Merge pull request #6 from paulirwin/java-catchup
Changes to catch up to v0.18 (d3e6340) of upstream java repo
2 parents dbfe452 + 7da474f commit a14db08

13 files changed

+270
-52
lines changed

src/F23.StringSimilarity/Cosine.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ public double Similarity(string s1, string s2)
6868
* @param profile
6969
* @return L2 norm
7070
*/
71-
protected static double Norm(int[] profile)
71+
private static double Norm(int[] profile)
7272
{
7373
double agg = 0;
7474

@@ -80,14 +80,14 @@ protected static double Norm(int[] profile)
8080
return Math.Sqrt(agg);
8181
}
8282

83-
protected static double DotProduct(int[] profile1, int[] profile2)
83+
private static double DotProduct(int[] profile1, int[] profile2)
8484
{
8585
int length = Math.Min(profile1.Length, profile2.Length);
8686

8787
double agg = 0;
8888
for (int i = 0; i < length; i++)
8989
{
90-
agg += profile1[i] * profile2[i];
90+
agg += (double)profile1[i] * profile2[i];
9191
}
9292
return agg;
9393
}

src/F23.StringSimilarity/Damerau.cs

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -40,82 +40,82 @@ namespace F23.StringSimilarity
4040
/// This is not to be confused with the optimal string alignment distance, which
4141
/// is an extension where no substring can be edited more than once.
4242
/// </summary>
43-
public class Damerau : IMetricStringDistance, IStringDistance
43+
public class Damerau : IMetricStringDistance
4444
{
4545
public double Distance(string s1, string s2)
4646
{
4747
// Infinite distance is the max possible distance
48-
int INFINITE = s1.Length + s2.Length;
48+
int inf = s1.Length + s2.Length;
4949

5050
// Create and initialize the character array indices
51-
var DA = new Dictionary<char, int>();
51+
var da = new Dictionary<char, int>();
5252

5353
for (int d = 0; d < s1.Length; d++)
5454
{
55-
if (!DA.ContainsKey(s1[d]))
55+
if (!da.ContainsKey(s1[d]))
5656
{
57-
DA[s1[d]] = 0;
57+
da[s1[d]] = 0;
5858
}
5959
}
6060

6161
for (int d = 0; d < s2.Length; d++)
6262
{
63-
if (!DA.ContainsKey(s2[d]))
63+
if (!da.ContainsKey(s2[d]))
6464
{
65-
DA[s2[d]] = 0;
65+
da[s2[d]] = 0;
6666
}
6767
}
6868

6969
// Create the distance matrix H[0 .. s1.length+1][0 .. s2.length+1]
70-
int[,] H = new int[s1.Length + 2, s2.Length + 2];
70+
int[,] h = new int[s1.Length + 2, s2.Length + 2];
7171

7272
// Initialize the left and top edges of H
7373
for (int i = 0; i <= s1.Length; i++)
7474
{
75-
H[i + 1, 0] = INFINITE;
76-
H[i + 1, 1] = i;
75+
h[i + 1, 0] = inf;
76+
h[i + 1, 1] = i;
7777
}
7878

7979
for (int j = 0; j <= s2.Length; j++)
8080
{
81-
H[0, j + 1] = INFINITE;
82-
H[1, j + 1] = j;
81+
h[0, j + 1] = inf;
82+
h[1, j + 1] = j;
8383
}
8484

8585
// Fill in the distance matrix H
8686
// Look at each character in s1
8787
for (int i = 1; i <= s1.Length; i++)
8888
{
89-
int DB = 0;
89+
int db = 0;
9090

9191
// Look at each character in s2
9292
for (int j = 1; j <= s2.Length; j++)
9393
{
94-
int i1 = DA[s2[j - 1]];
95-
int j1 = DB;
94+
int i1 = da[s2[j - 1]];
95+
int j1 = db;
9696

9797
int cost = 1;
9898
if (s1[i - 1] == s2[j - 1])
9999
{
100100
cost = 0;
101-
DB = j;
101+
db = j;
102102
}
103103

104-
H[i + 1, j + 1] = Min(
105-
H[i, j] + cost, // Substitution
106-
H[i + 1, j] + 1, // Insertion
107-
H[i, j + 1] + 1, // Deletion
108-
H[i1, j1] + (i - i1 - 1) + 1 + (j - j1 - 1)
104+
h[i + 1, j + 1] = Min(
105+
h[i, j] + cost, // Substitution
106+
h[i + 1, j] + 1, // Insertion
107+
h[i, j + 1] + 1, // Deletion
108+
h[i1, j1] + (i - i1 - 1) + 1 + (j - j1 - 1)
109109
);
110110
}
111111

112-
DA[s1[i - 1]] = i;
112+
da[s1[i - 1]] = i;
113113
}
114114

115-
return H[s1.Length + 1, s2.Length + 1];
115+
return h[s1.Length + 1, s2.Length + 1];
116116
}
117117

118-
protected static int Min(int a, int b, int c, int d)
118+
private static int Min(int a, int b, int c, int d)
119119
=> Math.Min(a, Math.Min(b, Math.Min(c, d)));
120120
}
121121
}

src/F23.StringSimilarity/F23.StringSimilarity.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
<Compile Include="MetricLCS.cs" />
5757
<Compile Include="NGram.cs" />
5858
<Compile Include="NormalizedLevenshtein.cs" />
59+
<Compile Include="OptimalStringAlignment.cs" />
5960
<Compile Include="Properties\AssemblyInfo.cs" />
6061
<Compile Include="QGram.cs" />
6162
<Compile Include="ShingleBased.cs" />

src/F23.StringSimilarity/JaroWinkler.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ public class JaroWinkler : INormalizedStringSimilarity, INormalizedStringDistanc
4545
/// <summary>
4646
/// The current value of the threshold used for adding the Winkler bonus. The default value is 0.7.
4747
/// </summary>
48-
public double Threshold { get; private set; }
48+
private double Threshold { get; }
4949

5050
/// <summary>
5151
/// Creates a new instance with default threshold (0.7)

src/F23.StringSimilarity/MetricLCS.cs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,13 @@ public class MetricLCS : IMetricStringDistance, INormalizedStringDistance
4444
/// <returns>LCS distance metric</returns>
4545
public double Distance(string s1, string s2)
4646
{
47+
int mLen = Math.Max(s1.Length, s2.Length);
48+
49+
if (mLen == 0) return 0.0;
50+
4751
return 1.0
4852
- (1.0 * lcs.Length(s1, s2))
49-
/ Math.Max(s1.Length, s2.Length);
50-
53+
/ mLen;
5154
}
5255
}
5356
}

src/F23.StringSimilarity/NGram.cs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,10 @@ namespace F23.StringSimilarity
4040
/// </summary>
4141
public class NGram : INormalizedStringDistance
4242
{
43+
private const int DEFAULT_N = 2;
4344
private readonly int n;
4445

45-
public NGram() : this(2) { }
46+
public NGram() : this(DEFAULT_N) { }
4647

4748
public NGram(int n)
4849
{
@@ -83,7 +84,7 @@ public double Distance(string s0, string s1)
8384
char[] sa = new char[sl + n - 1];
8485
float[] p; // 'previous' cost array, horizontally
8586
float[] d; // Cost array, horizontally
86-
float[] _d; // Placeholder to assist in swapping p and d
87+
float[] d2; // Placeholder to assist in swapping p and d
8788

8889
// Construct sa with prefix
8990
for (int i1 = 0; i1 < sa.Length; i1++)
@@ -152,9 +153,9 @@ public double Distance(string s0, string s1)
152153
d[i] = Math.Min(Math.Min(d[i - 1] + 1, p[i] + 1), p[i - 1] + ec);
153154
}
154155
// Copy current distance counts to 'previous row' distance counts
155-
_d = p;
156+
d2 = p;
156157
p = d;
157-
d = _d;
158+
d = d2;
158159
}
159160

160161
// Our last action in the above loop was to switch d and p, so p now

src/F23.StringSimilarity/NormalizedLevenshtein.cs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,13 @@ public class NormalizedLevenshtein : INormalizedStringDistance, INormalizedStrin
4343
/// <returns>The normalized Levenshtein distance</returns>
4444
////
4545
public double Distance(string s1, string s2)
46-
=> l.Distance(s1, s2) / Math.Max(s1.Length, s2.Length);
46+
{
47+
int mLen = Math.Max(s1.Length, s2.Length);
48+
49+
if (mLen == 0) return 0.0;
50+
51+
return l.Distance(s1, s2) / mLen;
52+
}
4753

4854
/// <summary>
4955
/// Return 1 - distance.
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
/*
2+
* The MIT License
3+
*
4+
* Copyright 2016 feature[23]
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in
14+
* all copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22+
* THE SOFTWARE.
23+
*/
24+
25+
using System;
26+
using F23.StringSimilarity.Interfaces;
27+
28+
namespace F23.StringSimilarity
29+
{
30+
public sealed class OptimalStringAlignment : IStringDistance
31+
{
32+
/// <summary>
33+
/// Compute the distance between strings: the minimum number of operations
34+
/// needed to transform one string into the other (insertion, deletion,
35+
/// substitution of a single character, or a transposition of two adjacent
36+
/// characters) while no substring is edited more than once.
37+
/// </summary>
38+
/// <param name="s1">the first input string</param>
39+
/// <param name="s2">the second input string</param>
40+
/// <returns>the OSA distance</returns>
41+
public double Distance(string s1, string s2)
42+
{
43+
int n = s1.Length, m = s2.Length;
44+
if (n == 0) return m;
45+
if (m == 0) return n;
46+
47+
// Create the distance matrix d[0 .. s1.length+1][0 .. s2.length+1]
48+
int[,] d = new int[s1.Length + 2, s2.Length + 2];
49+
50+
//initialize top row and leftmost column
51+
for (int i = 0; i <= n; i++)
52+
{
53+
d[i, 0] = i;
54+
}
55+
for (int j = 0; j <= m; j++)
56+
{
57+
d[0, j] = j;
58+
}
59+
60+
//fill the distance matrix
61+
int cost;
62+
63+
for (int i = 1; i <= n; i++)
64+
{
65+
for (int j = 1; j <= m; j++)
66+
{
67+
68+
//if s1[i - 1] = s2[j - 1] then cost = 0, else cost = 1
69+
cost = (s1[i - 1] == s2[j - 1]) ? 0 : 1;
70+
71+
d[i, j] = Min(
72+
d[i - 1, j - 1] + cost, // substitution
73+
d[i, j - 1] + 1, // insertion
74+
d[i - 1, j] + 1 // deletion
75+
);
76+
77+
//transposition check
78+
if (i > 1 && j > 1
79+
&& s1[i - 1] == s2[j - 2]
80+
&& s1[i - 2] == s2[j - 1]
81+
)
82+
{
83+
d[i, j] = Math.Min(d[i, j], d[i - 2, j - 2] + cost);
84+
}
85+
}
86+
}
87+
88+
return d[n, m];
89+
}
90+
91+
private static int Min(int a, int b, int c)
92+
=> Math.Min(a, Math.Min(b, c));
93+
}
94+
}

src/F23.StringSimilarity/ShingleBased.cs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,20 +26,25 @@ namespace F23.StringSimilarity
2626
{
2727
public abstract class ShingleBased
2828
{
29-
public int k { get; }
29+
private const int DEFAULT_K = 3;
30+
31+
/// <summary>
32+
/// Return k, the length of k-shingles (aka n-grams).
33+
/// </summary>
34+
protected int k { get; }
3035

3136
/// <summary>
3237
///
3338
/// </summary>
3439
/// <param name="k"></param>
35-
public ShingleBased(int k)
40+
protected ShingleBased(int k)
3641
{
3742
this.k = k;
3843
}
3944

4045
/// <summary>
4146
///
4247
/// </summary>
43-
public ShingleBased() : this(3) { }
48+
protected ShingleBased() : this(DEFAULT_K) { }
4449
}
4550
}

src/F23.StringSimilarity/StringSet.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ public double SorensenDiceSimilarity(StringSet other)
5555
throw new Exception("Profiles were not created using the same kshingling object!");
5656
}
5757

58-
return 2 * _vector.Intersection(other._vector) / (_vector.Size + other._vector.Size);
58+
return 2.0 * _vector.Intersection(other._vector) / (_vector.Size + other._vector.Size);
5959
}
6060
}
6161
}

0 commit comments

Comments
 (0)