
Commit f822944

Initial public release (#15)
* Initial public release
1 parent 504460e commit f822944

12 files changed, +203 -23 lines changed


Project.toml

Lines changed: 5 additions & 2 deletions
@@ -7,14 +7,17 @@ version = "0.1.0"
 CircularArrays = "7a955b69-7140-5f4e-a0ed-f168c5e2e749"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
-ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"

 [compat]
+CircularArrays = "1"
+DataStructures = "0.18"
+OffsetArrays = "1"
 julia = "1"

 [extras]
 Faker = "0efc519c-db33-5916-ab87-703215c3906f"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+Suppressor = "fd094767-a336-5f1f-9728-57cf17d0bbfb"

 [targets]
-test = ["Test", "Faker"]
+test = ["Test", "Faker", "Suppressor"]

README.md

Lines changed: 3 additions & 0 deletions
@@ -17,13 +17,16 @@ This package is be particulary useful for natural language processing tasks whic
 - [X] Support for unicodes
 - [ ] Custom user defined feature generation methods
 - [ ] Mecab-based tokenizer support
+- [X] Support for building databases directly from text files
+- [ ] Support for persistent databases

 ## Suported String Similarity Measures

 - [X] Dice coefficient
 - [X] Jaccard coefficient
 - [X] Cosine coefficient
 - [X] Overlap coefficient
+- [X] Exact match

 ## Installation

docs/src/index.md

Lines changed: 4 additions & 2 deletions
@@ -16,7 +16,7 @@ This package is be particulary useful for natural language processing tasks whic
 - [X] Support for unicodes
 - [ ] Custom user defined feature generation methods
 - [ ] Mecab-based tokenizer support
-- [ ] Support for building databases directly from text files
+- [X] Support for building databases directly from text files
 - [ ] Support for persistent databases

 ## Suported String Similarity Measures
@@ -64,6 +64,8 @@ push!(db, "fooo");

 # Convinient approach is to use an array of strings for multiple entries: `append!(db, ["foo", "bar", "fooo"]);`

+# OR: Build database from text files: `append!(db, "YOUR_FILE_NAME.txt");
+
 # Retrieve the closest match(es)
 res = search(Dice(), db, "foo"; α=0.8, ranked=true)
 # 2-element Vector{Tuple{String, Float64}}:
@@ -72,7 +74,7 @@ res = search(Dice(), db, "foo"; α=0.8, ranked=true)

 # Describe a working database collection
 desc = describe_collection(db)
-# (total_collection = 3, avg_num_ngrams = 4.5, total_ngrams = 13)
+# (total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)
 ```

 ## TODO: Benchmarks
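
The updated quick start can be exercised end to end with the API shown in this diff; a minimal sketch (the file name is a hypothetical stand-in for any newline-delimited text file):

```julia
using SimString

# Build a database of character 2-grams and load one string per line from a file.
# "names.txt" is a placeholder path, not a file shipped with the package.
db = DictDB(CharacterNGrams(2, " "))
append!(db, "names.txt")

# Retrieve ranked matches at or above the similarity threshold α.
res = search(Dice(), db, "foo"; α=0.8, ranked=true)

# Summary stats; note the field is now avg_size_ngrams rather than avg_num_ngrams.
desc = describe_collection(db)
```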

extras/benchmark_sim.jl

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+using SimString
+using Faker
+using BenchmarkTools
+using DataStructures
+
+################################# Benchmark Bulk addition #####################
+db = DictDB(CharacterNGrams(3, " "));
+Faker.seed(2020)
+@time fake_names = [string(Faker.first_name(), " ", Faker.last_name()) for i in 1:100_000];
+
+
+f(d, x) = append!(d, x)
+@time f(db, fake_names)
+
+
+
+################################ Simple Addition ###############################
+
+db = DictDB(CharacterNGrams(2, " "));
+push!(db, "foo");
+push!(db, "bar");
+push!(db, "fooo");
+
+f(x, c, s, a, r) = search(x, c, s; α=a, ranked=r)
+test = "foo";
+col = db;
+sim = Cosine();
+a = 0.8;
+r = true;
+
+f(Cosine(), db, "foo", 0.8, true)
+
+@btime f($sim, $col, $test, $a, $r)
+@btime search(Cosine(), db, "foo"; α=0.8, ranked=true)
+
+
+
+db2 = DictDB(CharacterNGrams(3, " "));
+append!(db2, ["foo", "bar", "fooo", "foor"]) # also works via multiple dispatch on a vector
+
+results = search(Cosine(), db, "foo"; α=0.8, ranked=true) # yet to be implemented
+
+bs = ["foo", "bar", "foo", "foo", "bar"]
+SimString.extract_features(CharacterNGrams(3, " "), "prepress")
+SimString.extract_features(WordNGrams(2, " ", " "), "You are a really really really cool dude.")
+
+db = DictDB(WordNGrams(2, " ", " "))
+push!(db, "You are a really really really cool dude.")

src/SimString.jl

Lines changed: 0 additions & 1 deletion
@@ -2,7 +2,6 @@ module SimString

 import Base: push!, append!
 using DataStructures: DefaultOrderedDict, DefaultDict
-using ProgressMeter
 using CircularArrays
 using OffsetArrays

src/dictdb.jl

Lines changed: 15 additions & 2 deletions
@@ -87,6 +87,7 @@ Basic summary stats for the DB
 db = DictDB(CharacterNGrams(2, " "));
 append!(db, ["foo", "bar", "fooo"]);
 describe_collection(db)
+(total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)

 # Returns
 * NamedTuples: Summary stats for the DB
@@ -98,7 +99,7 @@ function describe_collection(db::DictDB)
     # Total number of strings in collection
     ∑ = length(db.string_collection)

-    # Average number of ngram features
+    # Average size of ngram features
     n = [x for x in keys(db.string_size_map)]
     μ = sum(n) / length(n)

@@ -108,7 +109,19 @@ for i in values(db.string_feature_map)
         total_ngrams += length(i)
     end

-    return (total_collection = ∑, avg_num_ngrams = μ, total_ngrams = total_ngrams)
+    return (total_collection = ∑, avg_size_ngrams = μ, total_ngrams = total_ngrams)
+end
+
+
+"""
+Pretty print summary stats for the DB
+"""
+function Base.show(io::IO, x::DictDB)
+    metrics = describe_collection(x)
+    println(io, "DictDB($(x.feature_extractor))")
+    println(io, "Total collection: ", metrics.total_collection)
+    println(io, "Average number of ngram features: ", metrics.avg_size_ngrams)
+    println(io, "Total number of ngram features: ", metrics.total_ngrams)
 end

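The new `Base.show` method surfaces the same metrics that `describe_collection` returns; a small usage sketch based only on calls shown in this diff (the printed layout follows the `println` calls above):

```julia
using SimString

db = DictDB(CharacterNGrams(2, " "))
append!(db, ["foo", "bar", "fooo"])

# NamedTuple of summary stats, e.g.
# (total_collection = 3, avg_size_ngrams = 4.5, total_ngrams = 13)
describe_collection(db)

# Printing the database now routes through the new Base.show method,
# which reports the same three metrics, one per line.
show(stdout, db)
```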

src/features.jl

Lines changed: 60 additions & 0 deletions
@@ -99,8 +99,25 @@ end


 """
+    push!(db::AbstractSimStringDB, str::AbstractString)
+
 Add a new item to a new or existing collection of strings using
 the custom AbstractSimStringDB type.
+
+# Arguments:
+* `db`: AbstractSimStringDB - The collection of strings to add to
+* `str`: AbstractString - The string to add to the collection
+
+# Example:
+```julia
+db = DictDB(CharacterNGrams(2, " "));
+push!(db, "foo")
+push!(db, "bar")
+push!(db, "fooo")
+````
+
+# Returns:
+* `db`: AbstractSimStringDB - The collection of strings with the new string added
 """
 function push!(db::AbstractSimStringDB, str::AbstractString)
     # Extract features based on the specified feature extractor
@@ -125,11 +142,54 @@ end


 """
+    append!(db::AbstractSimStringDB, str::Vector)
+
 Add bulk items to a new or existing collection of strings using
 the custom AbstractSimStringDB type.
+
+# Arguments:
+* db: AbstractSimStringDB - The database to add the strings to
+* str: Vector of AbstractString - Vector/Array of strings to add to the database
+
+# Example:
+```julia
+db = DictDB(CharacterNGrams(2, " "));
+append!(db, ["foo", "foo", "fooo"]);
+```
+
+# Returns:
+* db: AbstractSimStringDB - The database with the new strings added
 """
 function append!(db::AbstractSimStringDB, str::Vector)
     @inbounds @simd for i in str
         push!(db, i)
     end
+end
+
+
+"""
+    append!(db::AbstractSimStringDB, file::AbstractString)
+
+Add bulk items to a new or existing collection of strings using
+from a file using the custom AbstractSimStringDB type.
+
+# Arguments:
+* `db``: AbstractSimStringDB - The database to add the items to
+* `file`: AbstractString - Path to the file to read from
+
+# Example:
+```julia
+db = DictDB(CharacterNGrams(2, " "));
+append!(db, "./data/test.txt")
+```
+
+# Returns:
+* `db`: AbstractSimStringDB - The database with the items added
+"""
+function append!(db::AbstractSimStringDB, file::AbstractString)
+    open(file) do f
+        for line in eachline(f)
+            push!(db, line)
+        end
+    end
 end
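
As a usage sketch, the new file-based `append!` can be pointed at the `test/dummy_words.txt` fixture added in this commit (assuming it is run from the repository root):

```julia
using SimString

# One string per line is pushed into the database, so file-based loading
# behaves exactly like pushing each string individually.
db = DictDB(CharacterNGrams(2, " "))
append!(db, "test/dummy_words.txt")   # adds "foo", "bar", "fooo"

search(Cosine(), db, "foo"; α=0.8, ranked=true)
```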

src/search.jl

Lines changed: 6 additions & 6 deletions
@@ -74,7 +74,7 @@ function overlap_join(db_collection::AbstractSimStringDB, features, τ, candidat
     results = String[]

     for (candidate, match_count) in candidate_match_counts
-        for i in (query_feature_length - τ + 1) : query_feature_length # TODO: Verify
+        for i in (query_feature_length - τ + 1) : query_feature_length
             if candidate in lookup_feature_set_by_size_feature(db_collection, candidate_size, features[i])
                 match_count += 1
             end
@@ -103,16 +103,16 @@ function search!(measure::AbstractSimilarityMeasure, db_collection::DictDB, quer
     features = extract_features(db_collection.feature_extractor, query)

     # Metadata from the generated features (length, min & max sizes)
-    length_of_features = length(features)
-    min_feature_size = minimum_feature_size(measure, length_of_features, α)
-    max_feature_size = maximum_feature_size(measure, db_collection, length_of_features, α)
+    # length_of_features = length(features)
+    # min_feature_size = minimum_feature_size(measure, length_of_features, α)
+    # max_feature_size = maximum_feature_size(measure, db_collection, length_of_features, α)

     results = String[]

     # Generate and return results from the potential candidate size pool
-    @inbounds for candidate_size in min_feature_size:max_feature_size
+    @inbounds for candidate_size in minimum_feature_size(measure, length(features), α) : maximum_feature_size(measure, db_collection, length(features), α)
         # Minimum overlap
-        τ = minimum_overlap(measure, length_of_features, candidate_size, α)
+        τ = minimum_overlap(measure, length(features), candidate_size, α)

         # Generate approximate candidates from the overlap join
         append!(results, overlap_join(db_collection, features, τ, candidate_size))
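
For reference, the refactored `search!` path is driven by calls like the ones below (taken from the docs and benchmark script in this commit); since the candidate-size bounds come from `minimum_feature_size`/`maximum_feature_size` of the query's feature count and α, a stricter α generally narrows the pool of candidate sizes that reach the overlap join:

```julia
using SimString

db = DictDB(CharacterNGrams(2, " "))
append!(db, ["foo", "bar", "fooo"])

# Same query at two thresholds: the higher α prunes more candidate sizes
# before minimum_overlap and overlap_join are consulted.
search(Cosine(), db, "foo"; α=0.8, ranked=true)
search(Cosine(), db, "foo"; α=0.99, ranked=true)
```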

test/dummy_sents.txt

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+You are a really really really cool dude.
+Sometimes you are not really really cool tho

test/dummy_words.txt

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+foo
+bar
+fooo
