|
| 1 | +(ns orchard.inspect.analytics |
| 2 | + "Submodule of Orchard Inspector for getting quick insights about the inspected |
| 3 | + data. A \"Metabase\" for Orchard/CIDER Inspector." |
| 4 | + (:refer-clojure :exclude [bounded-count]) |
| 5 | + (:require |
| 6 | + [clojure.string :as str]) |
| 7 | + (:import |
| 8 | + (java.util List Map))) |
| 9 | + |
(def ^:dynamic *size-cutoff*
  "Maximum number of leading elements of a collection that are examined when
  calculating analytics. Keeps execution time under control on large inputs."
  100000)
| 13 | + |
(defn- non-nil-hmap
  "Build a map from alternating keys and values, leaving out every pair whose
  value is nil. A trailing key without a value is ignored."
  [& keyvals]
  (into {}
        (for [[k v] (partition 2 keyvals)
              :when (some? v)]
          [k v])))
| 18 | + |
(defn- *frequencies
  "Like `frequencies`, but only looks at the first `*size-cutoff*` elements of
  `coll` and returns an array-map ordered from the most to the least frequent
  value."
  [coll]
  (let [counts (frequencies (eduction (take *size-cutoff*) coll))
        ranked (sort-by val > counts)]
    ;; array-map (rather than `into {}`) keeps the sorted order for any size.
    (apply array-map (mapcat (juxt key val) ranked))))
| 26 | + |
(definline ^:private inc-if
  "Return `val` plus one when `condition` is truthy, otherwise `val` unchanged."
  [val condition]
  ;; Bind `val` first so it is evaluated before `condition`, exactly once.
  `(let [v# ~val]
     (if ~condition (inc v#) v#)))
| 29 | + |
(defn- count-pred
  "Walk at most `limit` leading elements of `coll` and count those for which
  `pred` returns a truthy value.

  Returns a pair `[n ratio]`: `n` is the number of matching elements and `ratio`
  is `n` divided by the number of elements examined. When no element was
  examined (empty `coll` or a non-positive `limit`) the ratio is 0."
  [pred limit ^Iterable coll]
  (let [it (.iterator coll)]
    (loop [i 0, n 0]
      (if (and (< i limit) (.hasNext it))
        (let [x (.next it)]
          (recur (inc i) (cond-> n (pred x) inc)))
        ;; Dividing by a zero `i` would throw ArithmeticException, which made
        ;; every caller blow up on empty collections.
        [n (if (zero? i) 0 (/ n i))]))))
| 37 | + |
(defn- bounded-count
  "Count the elements of `coll`, stopping once `limit` elements have been seen."
  [limit coll]
  (let [[total] (count-pred (constantly true) limit coll)]
    total))
| 40 | + |
(defn- list-of-tuples?
  "Heuristic-based: a sequence is a list of tuples if more than 20 of its first
  100 items, or more than 30% of those items, are vectors with fewer than 20
  elements."
  [coll]
  (and (instance? List coll)
       (let [[n ratio] (count-pred #(and (vector? %) (< (count %) 20)) 100 coll)]
         (or (> n 20) (> ratio 0.3)))))
| 48 | + |
(defn- list-of-records?
  "Heuristic-based: a sequence is a list of 'records' if more than 20 of its
  first 100 items, or more than 30% of those items, are maps with fewer than 20
  entries."
  [coll]
  (and (instance? List coll)
       (let [[n ratio] (count-pred #(and (map? %) (< (count %) 20)) 100 coll)]
         (or (> n 20) (> ratio 0.3)))))
| 56 | + |
(defn- numbers-stats
  "Summarize the numbers found among the first `*size-cutoff*` elements of
  `coll`. Non-numeric elements are skipped but still count toward the cutoff.

  Returns nil when no number was seen, otherwise a map with:
  - :n     how many numbers were seen
  - :zeros how many of them are zero
  - :max   the largest number
  - :min   the smallest number
  - :mean  the arithmetic mean, as a float (or as a double when the mean does
           not fit into the float range)"
  [^Iterable coll]
  (let [it (.iterator coll)]
    (loop [i 0, hi nil, lo nil, zeros 0, n 0, sum 0]
      (if (and (< i *size-cutoff*) (.hasNext it))
        (let [x (.next it)]
          (if (number? x)
            (recur (inc i)
                   (if (nil? hi) x (max hi x))
                   (if (nil? lo) x (min lo x))
                   (inc-if zeros (zero? x))
                   (inc n)
                   ;; +' promotes to BigInt instead of throwing
                   ;; ArithmeticException when the total overflows a long.
                   (+' sum x))
            (recur (inc i) hi lo zeros n sum)))
        (when (> n 0)
          (let [mean (double (/ sum n))]
            {:n n, :zeros zeros, :max hi, :min lo,
             ;; `float` throws IllegalArgumentException for values outside the
             ;; float range, so such means are reported as doubles.
             :mean (if (or (< mean (- Float/MAX_VALUE))
                           (> mean Float/MAX_VALUE))
                     mean
                     (float mean))}))))))
| 72 | + |
;; Shared encoder used to test whether a string consists only of ASCII characters.
(def ^:private ^java.nio.charset.CharsetEncoder ascii-enc
  (-> "US-ASCII"
      java.nio.charset.Charset/forName
      .newEncoder))
| 75 | + |
(defn- strings-stats
  "Summarize the strings found among the first `*size-cutoff*` elements of
  `coll`. Non-string elements are skipped but still count toward the cutoff.

  Returns nil when no string was seen, otherwise a map with :n (number of
  strings), :blank (how many are blank), :ascii (how many are encodable as
  US-ASCII), :max-len, :min-len and :avg-len (a float)."
  [^Iterable coll]
  (let [;; `take` comes before `filter` so skipped elements use up the budget.
        strs (eduction (take *size-cutoff*) (filter string?) coll)
        init {:n 0, :blank 0, :ascii 0, :max-len nil, :min-len nil, :sum 0}
        step (fn [acc ^String s]
               (let [len (count s)]
                 {:n       (inc (:n acc))
                  :blank   (inc-if (:blank acc) (str/blank? s))
                  :ascii   (inc-if (:ascii acc) (.canEncode ascii-enc s))
                  :max-len (if-some [longest (:max-len acc)] (max longest len) len)
                  :min-len (if-some [shortest (:min-len acc)] (min shortest len) len)
                  :sum     (+ (:sum acc) len)}))
        {:keys [n sum] :as totals} (reduce step init strs)]
    (when (pos? n)
      (-> totals
          (dissoc :sum)
          (assoc :avg-len (float (/ sum n)))))))
| 93 | + |
(defn- colls-stats
  "Summarize the nested collections (instances of java.util.Collection) found
  among the first `*size-cutoff*` elements of `coll`. Other elements are skipped
  but still count toward the cutoff.

  Returns nil when no collection was seen, otherwise a map with :n (number of
  collections), :empty (how many are empty), :max-size, :min-size and :avg-size
  (a float)."
  [^Iterable coll]
  (let [;; `take` comes before `filter` so skipped elements use up the budget.
        nested (eduction (take *size-cutoff*)
                         (filter #(instance? java.util.Collection %))
                         coll)
        init {:n 0, :empty 0, :max-size nil, :min-size nil, :sum 0}
        step (fn [acc c]
               (let [size (count c)]
                 {:n        (inc (:n acc))
                  :empty    (inc-if (:empty acc) (empty? c))
                  :max-size (if-some [biggest (:max-size acc)] (max biggest size) size)
                  :min-size (if-some [smallest (:min-size acc)] (min smallest size) size)
                  :sum      (+ (:sum acc) size)}))
        {:keys [n sum] :as totals} (reduce step init nested)]
    (when (pos? n)
      (-> totals
          (dissoc :sum)
          (assoc :avg-size (float (/ sum n)))))))
| 110 | + |
(defn- basic-list-stats
  "Collect general statistics about `coll` when it is a java.util.List;
  returns nil for anything else.

  The result contains the type and value frequencies plus the number, string
  and nested-collection summaries, each only when non-nil. With a truthy
  `show-count?` it also contains :count (bounded by `*size-cutoff*`) and, when
  the cutoff was reached, `:cutoff? true`."
  [coll show-count?]
  (when (instance? List coll)
    (let [limit   *size-cutoff*
          seen    (bounded-count limit coll)
          capped? (and show-count? (>= seen limit))]
      (non-nil-hmap
       :cutoff?     (if capped? true nil)
       :count       (if show-count? seen nil)
       :types       (*frequencies (map type coll))
       :frequencies (*frequencies coll)
       :numbers     (numbers-stats coll)
       :strings     (strings-stats coll)
       :collections (colls-stats coll)))))
| 122 | + |
(defn- keyvals-stats
  "Collect statistics about the keys and the values of `coll` when it is a
  java.util.Map with more than 10 entries; returns nil otherwise.

  The result contains :count (bounded by `*size-cutoff*`), `:cutoff? true` when
  the cutoff was reached, and the list statistics of the keys (:keys) and of
  the values (:values)."
  [coll]
  (when (instance? Map coll)
    ;; java.util.Map is not Iterable, so it must not go through the
    ;; iterator-based counting helper; Map.size gives the same bounded count.
    (let [cnt (min *size-cutoff* (.size ^Map coll))]
      (when (> cnt 10)
        (non-nil-hmap
         :cutoff? (when (>= cnt *size-cutoff*) true)
         :count cnt
         ;; Only the first *size-cutoff* entries are analyzed, so do not copy
         ;; more than that.
         :keys (basic-list-stats (into [] (take *size-cutoff*) (keys coll)) false)
         :values (basic-list-stats (into [] (take *size-cutoff*) (vals coll)) false))))))
| 132 | + |
(defn- tuples-stats
  "Collect per-position statistics for `coll` when it looks like a list of
  tuples (see `list-of-tuples?`); returns nil otherwise.

  The result contains :count (bounded by `*size-cutoff*`), `:cutoff? true` when
  the cutoff was reached, and :tuples, a vector with the list statistics of
  each tuple position (at most the first 20 positions). Elements that are not
  vectors contribute nil to every position."
  [^Iterable coll]
  (when (list-of-tuples? coll)
    (let [cnt (bounded-count *size-cutoff* coll)
          all (into [] (take *size-cutoff*) coll)
          ;; Only vectors are split into columns below, so only vectors may
          ;; decide the number of columns. Calling `count` on other elements
          ;; throws for numbers and inflates the width for strings or maps.
          widest (reduce max 0 (map count (filter vector? all)))
          longest (min 20 widest)]
      (non-nil-hmap
       :cutoff? (when (>= cnt *size-cutoff*) true)
       :count cnt
       :tuples (mapv (fn [i]
                       (basic-list-stats
                        (mapv #(when (vector? %) (nth % i nil)) all)
                        false))
                     (range longest))))))
| 146 | + |
(defn- records-stats
  "Collect per-key statistics for `coll` when it looks like a list of 'records'
  (see `list-of-records?`); returns nil otherwise.

  The result contains :count (bounded by `*size-cutoff*`), `:cutoff? true` when
  the cutoff was reached, and :by-key, a map from every key seen in the
  records to the list statistics of the values under that key. Elements that
  are not maps contribute nil for every key."
  [^Iterable coll]
  (when (list-of-records? coll)
    (let [cnt (bounded-count *size-cutoff* coll)
          ;; Respect the cutoff here as well, so huge lists are not walked in
          ;; full once per key.
          all (into [] (take *size-cutoff*) coll)
          ;; The heuristic only needs a share of the items to be maps, so the
          ;; rest must be skipped: `keys` throws on e.g. numbers.
          ks (into #{} (comp (filter map?) (mapcat keys)) all)]
      (non-nil-hmap
       :cutoff? (when (>= cnt *size-cutoff*) true)
       :count cnt
       :by-key (into {}
                     (for [k ks]
                       ;; Guard with `map?`: a bare `get` would also look into
                       ;; vectors, strings and sets.
                       (let [kcoll (mapv #(when (map? %) (get % k)) all)]
                         [k (basic-list-stats kcoll false)])))))))
| 158 | + |
(defn analytics
  "Return various analytical data about `object`, or nil when nothing applies.
  The following kinds of data are supported, with different amounts of insight:
  - lists of numbers
  - lists of strings
  - lists of tuples
  - lists of 'records' (maps with same keys)
  - lists of arbitrary collections
  - arbitrary key-value maps"
  [object]
  ;; Try the most specific analysis first; the first non-nil result wins.
  (some (fn [analyze] (analyze object))
        [tuples-stats
         records-stats
         keyvals-stats
         #(basic-list-stats % true)]))
| 173 | + |
(defn can-analyze?
  "Simple heuristic: we currently only analyze collections (but most of them)
  and key-value maps."
  [object]
  ;; Maps are not java.util.Collection instances, yet `analytics` handles them,
  ;; so they have to be accepted explicitly.
  (or (instance? java.util.Collection object)
      (instance? Map object)))
0 commit comments