|
| 1 | +#%%silent |
| 2 | +#%%restart |
| 3 | +#%%cd @ |
| 4 | + |
| 5 | +# Copyright (C) 2020, Marek Gagolewski, https://www.gagolewski.com |
| 6 | +# |
| 7 | +# Permission is hereby granted, free of charge, to any person obtaining a copy |
| 8 | +# of this software and associated documentation files (the "Software"), to deal |
| 9 | +# in the Software without restriction, including without limitation the rights |
| 10 | +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 11 | +# copies of the Software, and to permit persons to whom the Software is |
| 12 | +# furnished to do so, subject to the following conditions: |
| 13 | +# |
| 14 | +# The above copyright notice and this permission notice shall be included in all |
| 15 | +# copies or substantial portions of the Software. |
| 16 | +# |
| 17 | +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 18 | +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 19 | +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 20 | +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 21 | +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 22 | +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 23 | +# SOFTWARE. |
| 24 | + |
| 25 | + |
| 26 | + |
| 27 | +############################################################################## |
| 28 | + |
# Benchmark battery: "https://github.com/gagolews/clustering_benchmarks_v1"
# (benchmarks_path presumably points at a local copy of it -- it is added to
# sys.path below and searched for the dataset folders)
benchmarks_path = "."

# If True, each folder's result table is written to a CSV file.
save_csv = False

# Feature preprocessing passed to benchmark(); change the index to switch.
preprocess = ("none", "std", "robuststd")[1]

# Dataset folders to process; change the slice to select other ones.
folders = [
    "wut", "sipu", "other", "fcps", "graves", "mnist", "uci", "g2mg", "h2mg",
][-2:]

# Which family of algorithms to run.
method = ("genieclust",)[0]
| 35 | + |
| 36 | +############################################################################## |
| 37 | + |
| 38 | +import sys |
| 39 | +sys.path.append(benchmarks_path) |
| 40 | +from load_dataset import load_dataset |
| 41 | +import numpy as np |
| 42 | +import pandas as pd |
| 43 | +import matplotlib.pyplot as plt |
| 44 | +import os.path, glob, re, csv |
| 45 | +from natsort import natsorted |
| 46 | +import genieclust |
| 47 | +import sklearn.metrics |
| 48 | +import seaborn as sns |
# Console output: print NumPy arrays compactly and show 20 rows of a
# truncated DataFrame.
np.set_printoptions(precision=5, threshold=10, edgeitems=10)
pd.set_option("display.min_rows", 20)

# matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases dropped the old names; use whichever one this
# installation provides instead of failing on an unknown style name.
for _style in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid"):
    if _style in plt.style.available:
        plt.style.use(_style)
        break
| 52 | +#plt.rcParams["figure.figsize"] = (8,4) |
| 53 | + |
| 54 | + |
| 55 | + |
| 56 | + |
| 57 | + |
def get_metrics(labels_true, labels_pred):
    """
    Compares a predicted partition with the reference one.

    Points whose reference label is 0 (the noise cluster) are left out of
    both label vectors before the comparison.

    Returns a new dict holding the items yielded by
    genieclust.compare_partitions.compare_partitions2().
    """
    not_noise = labels_true > 0  # noise cluster == 0
    scores = genieclust.compare_partitions.compare_partitions2(
        labels_true[not_noise], labels_pred[not_noise])
    return {**scores}
| 64 | + |
| 65 | + |
| 66 | + |
| 67 | + |
def do_benchmark_genie(res, genie, X, labels_true, K, params):
    """
    Runs `genie` on X for a grid of (M, gini_threshold) settings.

    For every setting, one record is appended to `res` (modified in place):
    the items of `params`, a "method" string identifying the setting, and
    the scores returned by get_metrics(). A dot is printed after each fit
    and a line break once the whole grid is done. Nothing is returned.
    """
    m_grid = sorted([1, 3, 5, 9, 15, 25], reverse=True)  # decreasing M => NNs are reused
    g_grid = [0.1, 0.3, 0.5, 0.7, 1.0]

    for M in m_grid:
        for g in g_grid:
            genie.set_params(
                n_clusters=K, gini_threshold=g, M=M, postprocess="all")
            labels_pred = genie.fit_predict(X)

            record = dict(params)
            record["method"] = "Genie_g%.1f:M%d"%(g,M)
            record.update(get_metrics(labels_true, labels_pred))
            res.append(record)

            print(".", end="")
    print(" ")
| 81 | + |
| 82 | + |
| 83 | + |
def do_benchmark_gic(res, gic, X, labels_true, K, params):
    """
    Runs `gic` on X for a grid of (M, add_clusters, gini_thresholds) settings.

    The empty list of thresholds is only tried with add_clusters == 0.
    For every setting, one record is appended to `res` (modified in place):
    the items of `params`, a "method" string encoding add_clusters, the
    number of thresholds and M, and the scores returned by get_metrics().
    A dot is printed after each fit and a line break at the very end.
    Nothing is returned.
    """
    for M in sorted([1, 3, 5, 9, 15, 25], reverse=True):  # decreasing M => NNs are reused
        for add in [5, 1, 0]:
            threshold_sets = [
                np.r_[0.3, 0.5, 0.7],
                np.linspace(0.0, 1.0, 11),
                [],
            ]
            for g in threshold_sets:
                n_thresholds = len(g)
                if n_thresholds > 0 or add <= 0:
                    gic.set_params(
                        n_clusters=K, gini_thresholds=g, add_clusters=add,
                        M=M, postprocess="all")
                    labels_pred = gic.fit_predict(X)

                    record = dict(params)
                    record["method"] = "GIc_A%d_TC%d:M%d"%(add,n_thresholds,M)
                    record.update(get_metrics(labels_true, labels_pred))
                    res.append(record)

                    print(".", end="")
    print(" ")
| 100 | + |
| 101 | + |
def _preprocess_features(X, preprocess):
    """Returns a centred and rescaled copy of X; see benchmark() for the modes."""
    if preprocess == "std":  # mean/sd, column by column
        s = X.std(axis=0, ddof=1)
        return (X - X.mean(axis=0)) / s

    if preprocess == "robuststd":  # median/MAD, column by column
        med = np.median(X, axis=0)
        s = np.median(np.abs(X - med), axis=0)
        s[s < 1e-12] = 1.0  # don't scale columns of zero MAD
        return (X - med) / s

    # "none" (and any other value): centre every column, but divide by one
    # common factor so that all axes are scaled proportionally
    s = X.std(axis=None, ddof=1)
    return (X - X.mean(axis=0)) / s


def benchmark(dataset, benchmarks_path, method, preprocess="none"):
    """
    Processes a single benchmark dataset.

    The data matrix is read from "<benchmarks_path>/<dataset>.data.gz" and
    every "<benchmarks_path>/<dataset>.labels<N>.gz" file found is treated
    as one reference label vector (label 0 denotes the noise cluster).
    Columns of zero variance are dropped, the features are preprocessed,
    a tiny amount of seeded Gaussian noise is added, and the matrix is
    converted to float32.

    preprocess is one of "none", "std", "robuststd":
    * "std" is (x-mean(x))/sd(x), applied to each column,
    * "robuststd" is (x-median(x))/mad(x), applied to each column
      (columns of zero MAD are only centred),
    * "none" (or any other value) centres each column and divides the whole
      matrix by its overall standard deviation.

    If method is "genieclust", Genie and GIc are run for each reference
    label vector via do_benchmark_genie() and do_benchmark_gic(), with the
    number of clusters equal to the number of non-noise reference labels;
    for any other method, no algorithm is run.

    Returns the list of result records (dicts) collected for this dataset;
    it is empty if no algorithm was run.
    """
    np.random.seed(123)  # the noise added below is the same in every run
    X = np.loadtxt(os.path.join(benchmarks_path, dataset+".data.gz"), ndmin=2)

    X = X[:, X.var(axis=0) > 0]  # remove all columns of 0 variance

    X = _preprocess_features(X, preprocess)

    X += np.random.normal(0.0, 1e-9, size=X.shape)  # add a tiny bit of noise
    X = X.astype(np.float32, order="C", copy=False)  # work with float32

    print("## %s preprocess=%s (n=%d, d=%d)" %
        (dataset, preprocess, X.shape[0], X.shape[1]))

    # Collect the names ("labels0", "labels1", ...) of the available
    # reference label vectors; files caught by the glob pattern whose names
    # do not follow the numbered scheme are ignored.
    label_names = []
    for name in glob.glob(os.path.join(benchmarks_path, dataset+".labels*.gz")):
        found = re.search(r"\.(labels[0-9]+)\.gz", name)
        if found is not None:
            label_names.append(found.group(1))
    label_names.sort()

    label_fnames = [os.path.join(benchmarks_path, "%s.%s.gz" % (dataset,name))
        for name in label_names]
    labels = [np.loadtxt(fname, dtype="int") for fname in label_fnames]

    res = []

    if method == "genieclust":
        genie = genieclust.Genie(compute_full_tree=False)
        gic = genieclust.GIc(compute_full_tree=False)

    for label_name, labels_true in zip(label_names, labels):
        params = dict(
            dataset=dataset,
            preprocess=preprocess,
            labels=label_name
        )

        labels_true_counts = np.bincount(labels_true)[1:]  # noise cluster == 0
        K = len(labels_true_counts)

        # To add another method:
        # 1. find a K-partition of X --> labels_pred
        # 2. res.append({
        #     **params,
        #     "method": "METHOD_NAME:param1:param2:etc.",
        #     **get_metrics(labels_true, labels_pred)
        # })

        if method == "genieclust":
            do_benchmark_genie(res, genie, X, labels_true, K, params)
            do_benchmark_gic(res, gic, X, labels_true, K, params)

    return res
| 171 | + |
| 172 | + |
# Process the selected folders one by one; each folder yields one table of
# results (and, if save_csv is set, one CSV file).
for folder in folders:
    fnames = glob.glob(os.path.join(benchmarks_path, folder, "*.data.gz"))

    # A dataset is identified by "<folder>/<name>". The pattern below relies
    # on "/" as the separator, so native separators are normalised first
    # (a no-op wherever os.sep is already "/").
    datasets = natsorted([
        re.search(r"([^/]*/[^/]*)\.data\.gz", name.replace(os.sep, "/"))[1]
        for name in fnames])

    res = []
    for dataset in datasets:
        res += benchmark(dataset, benchmarks_path,
            method=method, preprocess=preprocess)

    res_df = pd.DataFrame(res)

    if save_csv:
        out_fname = "results/v1_%s_%s_%s.csv"%(folder,preprocess,method)
        # make sure the output directory exists so that the results of a
        # finished run are not lost to a missing-directory error
        os.makedirs(os.path.dirname(out_fname), exist_ok=True)
        res_df.to_csv(out_fname,
            index=False, quoting=csv.QUOTE_NONNUMERIC)
| 188 | + |
0 commit comments