Skip to content

Commit c282486

Browse files
committed
genieclust benchmark results
1 parent c667669 commit c282486

File tree

2 files changed

+288
-0
lines changed

2 files changed

+288
-0
lines changed

benchmark_analyse.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
#%%silent
2+
#%%restart
3+
#%%cd @
4+
5+
# Copyright (C) 2020, Marek Gagolewski, https://www.gagolewski.com
6+
#
7+
# Permission is hereby granted, free of charge, to any person obtaining a copy
8+
# of this software and associated documentation files (the "Software"), to deal
9+
# in the Software without restriction, including without limitation the rights
10+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11+
# copies of the Software, and to permit persons to whom the Software is
12+
# furnished to do so, subject to the following conditions:
13+
#
14+
# The above copyright notice and this permission notice shall be included in all
15+
# copies or substantial portions of the Software.
16+
#
17+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23+
# SOFTWARE.
24+
25+
26+
27+
# "https://github.com/gagolews/clustering_benchmarks_v1"
28+
benchmarks_path = "."
29+
import sys
30+
sys.path.append(benchmarks_path)
31+
from load_dataset import load_dataset
32+
import numpy as np
33+
import pandas as pd
34+
import matplotlib.pyplot as plt
35+
import os.path, glob, re
36+
from natsort import natsorted
37+
import genieclust
38+
import sklearn.metrics
39+
import seaborn as sns
40+
# Console / plotting defaults for this script.
np.set_printoptions(precision=5, threshold=10, edgeitems=10)
# Use the fully-qualified option name instead of relying on pandas'
# abbreviated-pattern matching of "min_rows".
pd.set_option("display.min_rows", 20)
# Matplotlib 3.6 renamed the bundled seaborn style sheets to
# "seaborn-v0_8-*" and later releases dropped the old names; pick whichever
# name this Matplotlib installation actually provides.
_whitegrid_style = "seaborn-whitegrid"
if _whitegrid_style not in plt.style.available:
    _whitegrid_style = "seaborn-v0_8-whitegrid"
plt.style.use(_whitegrid_style)
43+
#plt.rcParams["figure.figsize"] = (8,4)
44+
45+
46+
47+
# Load the stored benchmark results.
res = pd.read_csv("results/v1_wut_none_genieclust.csv")


# Rows of interest: no preprocessing, the two selected methods, and
# datasets whose name does not contain "2mg".
_selected = (
    (res.preprocess == "none")
    & res.method.isin(["GIc_A0_TC3", "Genie_g0.3"])
    & ~res.dataset.str.contains("2mg")
)

# Column-wise maximum within each (dataset, method, preprocess, M) group;
# the remaining bookkeeping columns are dropped afterwards.
res_max = (
    res.loc[_selected, :]
    .groupby(["dataset", "method", "preprocess", "M"])
    .max()
    .reset_index()
    .drop(["k", "g", "noise", "labels"], axis=1)
)


# The `ar` column grouped by configuration, shared by both summaries below.
_ar_per_config = res_max.groupby(["method", "preprocess", "M"]).ar

# Mean `ar` per configuration, best first.
res_summary_ar = (
    _ar_per_config.mean()
    .sort_values(ascending=False)
    .rename("mean")
    .reset_index()
)
print(res_summary_ar)

# Median `ar` per configuration, best first (overwrites the mean summary).
res_summary_ar = (
    _ar_per_config.median()
    .sort_values(ascending=False)
    .rename("median")
    .reset_index()
)
print(res_summary_ar)
65+
66+
67+
#plt.rcParams["figure.figsize"] = (12,4)
68+
#plt.subplot("131")
69+
#sns.boxplot(y="method", x="ar", data=res_max.loc[res_max.preprocess=="none",:], orient="h")
70+
#plt.subplot("132")
71+
#sns.boxplot(y="method", x="ar", data=res_max.loc[res_max.preprocess=="standardise",:], orient="h")
72+
#plt.subplot("133")
73+
#sns.boxplot(y="method", x="ar", data=res_max.loc[res_max.preprocess=="standardise_robust",:], orient="h")
74+
75+
# Horizontal box plots of `ar` per method, split by M.
plt.rcParams["figure.figsize"] = (12, 8)
sns.boxplot(data=res_max, x="ar", y="method", hue="M", orient="h")
plt.show()
78+
79+
80+
plt.rcParams["figure.figsize"] = (8, 6)

# Fold the preprocessing scheme and M into a single label so that it can
# serve as the column axis of the heat maps.
res_max2 = res.copy()
res_max2["preprocess_M"] = res_max2.preprocess + "_" + res_max2.M.astype(str)

# Drop the "2mg" datasets ONCE, for both plots, so that the mean and the
# median heat maps summarise the same set of datasets.  The mask is built
# from the frame being filtered rather than from `res`.
res_max2 = (
    res_max2.loc[~res_max2.dataset.str.contains("2mg"), :]
    .groupby(["dataset", "method", "preprocess_M"])
    .max()  # column-wise maximum within each group
    .reset_index()
    .drop(["k", "g", "noise", "labels"], axis=1)
)

# One heat map per aggregate: rows are methods, columns are preprocess_M.
for _stat, _title in (("mean", "Mean ARI"), ("median", "Median ARI")):
    res_summary_ar2 = (
        res_max2.groupby(["method", "preprocess_M"]).ar
        .agg(_stat)
        .unstack()
    )
    sns.heatmap(res_summary_ar2, annot=True, fmt=".2f", vmin=0.5, vmax=1.0)
    plt.title(_title)
    plt.show()

benchmark_compute.py

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
#%%silent
2+
#%%restart
3+
#%%cd @
4+
5+
# Copyright (C) 2020, Marek Gagolewski, https://www.gagolewski.com
6+
#
7+
# Permission is hereby granted, free of charge, to any person obtaining a copy
8+
# of this software and associated documentation files (the "Software"), to deal
9+
# in the Software without restriction, including without limitation the rights
10+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11+
# copies of the Software, and to permit persons to whom the Software is
12+
# furnished to do so, subject to the following conditions:
13+
#
14+
# The above copyright notice and this permission notice shall be included in all
15+
# copies or substantial portions of the Software.
16+
#
17+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23+
# SOFTWARE.
24+
25+
26+
27+
##############################################################################
28+
29+
# "https://github.com/gagolews/clustering_benchmarks_v1"
30+
# Directory that is added to sys.path and searched for the dataset folders.
benchmarks_path = "."
# When True, each folder's results are written to a CSV file under results/.
save_csv = False

# Available choices; the indices/slices below select what this run uses.
_PREPROCESS_CHOICES = ["none", "std", "robuststd"]
_FOLDER_CHOICES = [
    "wut", "sipu", "other", "fcps", "graves", "mnist", "uci", "g2mg", "h2mg",
]
_METHOD_CHOICES = ["genieclust"]

preprocess = _PREPROCESS_CHOICES[1]
folders = _FOLDER_CHOICES[-2:]
method = _METHOD_CHOICES[0]
35+
36+
##############################################################################
37+
38+
import sys
39+
sys.path.append(benchmarks_path)
40+
from load_dataset import load_dataset
41+
import numpy as np
42+
import pandas as pd
43+
import matplotlib.pyplot as plt
44+
import os.path, glob, re, csv
45+
from natsort import natsorted
46+
import genieclust
47+
import sklearn.metrics
48+
import seaborn as sns
49+
# Console / plotting defaults for this script.
np.set_printoptions(precision=5, threshold=10, edgeitems=10)
# Use the fully-qualified option name instead of relying on pandas'
# abbreviated-pattern matching of "min_rows".
pd.set_option("display.min_rows", 20)
# Matplotlib 3.6 renamed the bundled seaborn style sheets to
# "seaborn-v0_8-*" and later releases dropped the old names; pick whichever
# name this Matplotlib installation actually provides.
_whitegrid_style = "seaborn-whitegrid"
if _whitegrid_style not in plt.style.available:
    _whitegrid_style = "seaborn-v0_8-whitegrid"
plt.style.use(_whitegrid_style)
52+
#plt.rcParams["figure.figsize"] = (8,4)
53+
54+
55+
56+
57+
58+
def get_metrics(labels_true, labels_pred):
    """
    Compare a predicted partition with a reference one, ignoring noise.

    Points whose reference label is 0 (the noise cluster) are removed from
    both label vectors before the comparison.

    Returns a new dict holding whatever key/value pairs
    genieclust.compare_partitions.compare_partitions2 reports.
    """
    # noise cluster == 0, so only strictly positive reference labels count
    not_noise = labels_true > 0
    scores = genieclust.compare_partitions.compare_partitions2(
        labels_true[not_noise], labels_pred[not_noise]
    )
    return {**scores}
64+
65+
66+
67+
68+
def do_benchmark_genie(res, genie, X, labels_true, K, params):
    """
    Run `genie` on X over a grid of M and gini_threshold values.

    For every combination, one dict is appended to `res` (in place):
    the entries of `params`, a "method" key of the form "Genie_g<g>:M<M>",
    and the scores returned by get_metrics().  `genie` must provide
    set_params() and fit_predict(); K is the requested number of clusters.
    A dot is printed after each fit as a progress indicator.
    """
    for M in sorted([1, 3, 5, 9, 15, 25], reverse=True):  # decreasing M => NNs are reused
        for g in [0.1, 0.3, 0.5, 0.7, 1.0]:
            genie.set_params(
                n_clusters=K, gini_threshold=g, M=M, postprocess="all"
            )
            labels_pred = genie.fit_predict(X)

            # same key order as {**params, "method": ..., **metrics}
            record = dict(params)
            record["method"] = "Genie_g%.1f:M%d" % (g, M)
            record.update(get_metrics(labels_true, labels_pred))
            res.append(record)

            print(".", end="")
    print(" ")
81+
82+
83+
84+
def do_benchmark_gic(res, gic, X, labels_true, K, params):
    """
    Run `gic` on X over a grid of M, add_clusters and gini_thresholds.

    For every combination that is not skipped, one dict is appended to
    `res` (in place): the entries of `params`, a "method" key of the form
    "GIc_A<add>_TC<number of thresholds>:M<M>", and the scores returned by
    get_metrics().  `gic` must provide set_params() and fit_predict();
    K is the requested number of clusters.
    A dot is printed after each fit as a progress indicator.
    """
    for M in sorted([1, 3, 5, 9, 15, 25], reverse=True):  # decreasing M => NNs are reused
        for add in [5, 1, 0]:
            for g in [np.r_[0.3, 0.5, 0.7], np.linspace(0.0, 1.0, 11), []]:
                # an empty threshold set is only benchmarked with add == 0
                if add > 0 and len(g) == 0:
                    continue

                gic.set_params(
                    n_clusters=K,
                    gini_thresholds=g,
                    add_clusters=add,
                    M=M,
                    postprocess="all",
                )
                labels_pred = gic.fit_predict(X)

                # same key order as {**params, "method": ..., **metrics}
                record = dict(params)
                record["method"] = "GIc_A%d_TC%d:M%d" % (add, len(g), M)
                record.update(get_metrics(labels_true, labels_pred))
                res.append(record)

                print(".", end="")
    print(" ")
100+
101+
102+
def benchmark(dataset, benchmarks_path, method, preprocess="none"):
    """
    Processes a single benchmark dataset.

    The data matrix is read from <benchmarks_path>/<dataset>.data.gz and
    every <dataset>.labels<N>.gz file next to it is used as a reference
    label vector (label 0 denotes noise points).

    preprocess is one of "none", "std", "robuststd":

    * "std" is (x-mean(x))/sd(x), computed column by column;
    * "robuststd" is (x-median(x))/mad(x), computed column by column
      (columns whose MAD is zero are only centred);
    * anything else ("none") centres each column and divides the whole
      matrix by one overall standard deviation, so that the relative
      scales of the columns are preserved.

    Returns a list of dicts, one per (label vector, algorithm setting);
    the list is empty if `method` is not supported.
    """
    np.random.seed(123)
    X = np.loadtxt(os.path.join(benchmarks_path, dataset+".data.gz"), ndmin=2)

    X = X[:, X.var(axis=0) > 0]  # remove all columns of 0 variance

    if preprocess == "std":  # mean/sd
        s = X.std(axis=0, ddof=1)
        X = (X-X.mean(axis=0))/s
    elif preprocess == "robuststd":  # median/MAD
        med = np.median(X, axis=0)
        s = np.median(np.abs(X-med), axis=0)
        s[s < 1e-12] = 1.0  # don't scale columns of zero MAD
        # the division by s was missing: data were centred but never scaled
        X = (X-med)/s
    else:
        s = X.std(axis=None, ddof=1)  # scale all axes proportionally
        # the division by s was missing here as well
        X = (X-X.mean(axis=0))/s

    X += np.random.normal(0.0, 1e-9, size=X.shape)  # add a tiny bit of noise
    X = X.astype(np.float32, order="C", copy=False)  # work with float32

    print("## %s preprocess=%s (n=%d, d=%d)" %
          (dataset, preprocess, X.shape[0], X.shape[1]))

    # The glob pattern is looser than the regex (e.g. it also matches
    # "<dataset>.labels_old.gz"); ignore such files instead of crashing
    # on a failed match.
    label_pattern = re.compile(r"\.(labels[0-9]+)\.gz")
    label_matches = [
        label_pattern.search(name)
        for name in glob.glob(os.path.join(benchmarks_path, dataset+".labels*.gz"))
    ]
    label_names = sorted(m.group(1) for m in label_matches if m is not None)
    labels = [
        np.loadtxt(
            os.path.join(benchmarks_path, "%s.%s.gz" % (dataset, name)),
            dtype="int")
        for name in label_names
    ]

    res = []

    if method == "genieclust":
        genie = genieclust.Genie(compute_full_tree=False)
        gic = genieclust.GIc(compute_full_tree=False)

    for label_name, labels_true in zip(label_names, labels):
        params = dict(
            dataset=dataset,
            preprocess=preprocess,
            labels=label_name
        )

        # noise cluster == 0, hence it is not counted as a cluster
        K = len(np.bincount(labels_true)[1:])

        # To add another method:
        # 1. find a K-partition of X --> labels_pred
        # 2. res.append({
        #     **params,
        #     "method": "METHOD_NAME:param1:param2:etc.",
        #     **get_metrics(labels_true, labels_pred)
        # })

        if method == "genieclust":
            do_benchmark_genie(res, genie, X, labels_true, K, params)
            do_benchmark_gic(res, gic, X, labels_true, K, params)

    return res
171+
172+
173+
# Benchmark every dataset of every selected folder; results are collected
# (and optionally saved) separately for each folder.
for folder in folders:
    fnames = glob.glob(os.path.join(benchmarks_path, folder, "*.data.gz"))
    # Dataset ids have the form "<folder>/<name>".  They are derived from
    # the base name of each file rather than with a "/"-based regex, which
    # found no match on platforms whose path separator is not "/".
    datasets = natsorted([
        folder + "/" + os.path.basename(name)[:-len(".data.gz")]
        for name in fnames
    ])

    res = []
    for dataset in datasets:
        res += benchmark(dataset, benchmarks_path,
                         method=method, preprocess=preprocess)

    res_df = pd.DataFrame(res)

    if save_csv:
        # make sure the output directory exists before writing to it
        os.makedirs("results", exist_ok=True)
        res_df.to_csv("results/v1_%s_%s_%s.csv" % (folder, preprocess, method),
                      index=False, quoting=csv.QUOTE_NONNUMERIC)
188+

0 commit comments

Comments
 (0)