diff --git a/Cargo.toml b/Cargo.toml index d076f2f..844d310 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,3 +26,4 @@ twox-hash = "1.1" serde = { version = "1.0", features = ["derive"] } rand = { version = "0.7", features = ["small_rng"] } packed_simd = { version = "0.3", features = ["into_bits"], optional = true } +mem_dbg = {version="0.2.4", optional=true} diff --git a/src/count_min.rs b/src/count_min.rs index 6a45e39..439d9ca 100644 --- a/src/count_min.rs +++ b/src/count_min.rs @@ -39,6 +39,7 @@ use crate::traits::{Intersect, IntersectPlusUnionIsPlus, New, UnionAssign}; serialize = "C: Serialize, ::Config: Serialize", deserialize = "C: Deserialize<'de>, ::Config: Deserialize<'de>" ))] +#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] pub struct CountMinSketch { counters: Vec>, offsets: Vec, // to avoid malloc/free each push diff --git a/src/distinct.rs b/src/distinct.rs index 1fad41c..29c7b03 100644 --- a/src/distinct.rs +++ b/src/distinct.rs @@ -59,6 +59,7 @@ use self::consts::{BIAS_DATA, RAW_ESTIMATE_DATA, TRESHOLD_DATA}; /// Like [`HyperLogLog`] but implements `Ord` and `Eq` by using the estimate of the cardinality. #[derive(Serialize, Deserialize)] #[serde(bound = "")] +#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] pub struct HyperLogLogMagnitude(HyperLogLog); impl Ord for HyperLogLogMagnitude { #[inline(always)] @@ -127,6 +128,7 @@ impl IntersectPlusUnionIsPlus for HyperLogLogMagnitude { /// See [*HyperLogLog: the analysis of a near-optimal cardinality estimation algorithm*](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf) and [*HyperLogLog in Practice: Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm*](https://ai.google/research/pubs/pub40671) for background on HyperLogLog with bias correction. 
#[derive(Serialize, Deserialize)] #[serde(bound = "")] +#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] pub struct HyperLogLog { alpha: f64, zero: usize, @@ -168,6 +170,29 @@ where } } + /// Returns the current harmonic sum of the `HyperLogLog` data structure. + /// + /// The harmonic sum is the sum of the reciprocals of two raised to each register value, i.e. + /// `1/2^m[0] + 1/2^m[1] + ... + 1/2^m[n-1]`. + pub fn harmonic_sum(&self) -> f64 { + self.sum + } + + /// Returns the current number of zero registers in the `HyperLogLog` data structure. + pub fn zero_registers(&self) -> usize { + self.zero + } + + /// Returns the number of registers in the `HyperLogLog` data structure. + pub fn number_of_registers(&self) -> usize { + self.m.len() + } + + /// Returns the precision of the `HyperLogLog` data structure. + pub fn precision(&self) -> u8 { + self.p + } + /// "Visit" an element. #[inline] pub fn push(&mut self, value: &V) { @@ -337,7 +362,7 @@ where } fn get_alpha(p: u8) -> f64 { - assert!(4 <= p && p <= 16); + assert!(4 <= p && p <= 18); match p { 4 => 0.673, 5 => 0.697, diff --git a/src/distinct/consts.rs b/src/distinct/consts.rs index 752e2b7..2aa8b09 100644 --- a/src/distinct/consts.rs +++ b/src/distinct/consts.rs @@ -42,6 +42,16 @@ mod test { ret }) } + + #[test] + fn test_length_compatability() { + assert_eq!(TRESHOLD_DATA.len(), RAW_ESTIMATE_DATA.len()); + assert_eq!(RAW_ESTIMATE_DATA.len(), BIAS_DATA.len()); + + for (raw_estimate_data, bias_data) in RAW_ESTIMATE_DATA.iter().zip(BIAS_DATA.iter()) { + assert_eq!(raw_estimate_data.len(), bias_data.len()); + } + } } #[rustfmt::skip] diff --git a/src/linked_list.rs b/src/linked_list.rs index 527391a..e75f4c8 100644 --- a/src/linked_list.rs +++ b/src/linked_list.rs @@ -2,6 +2,7 @@ use serde::{Deserialize, Serialize}; use std::{iter, marker, ops}; #[derive(Copy, Clone, PartialEq, Eq, Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] pub struct 
LinkedListIndex<'a>(usize, marker::PhantomData<&'a ()>); impl<'a> LinkedListIndex<'a> { #[inline(always)] @@ -11,6 +12,7 @@ impl<'a> LinkedListIndex<'a> { } #[derive(Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] pub struct LinkedList { vec: Box<[(usize, usize, Option)]>, head: usize, @@ -304,6 +306,7 @@ impl<'a, T> ops::IndexMut> for LinkedList { } } + pub struct LinkedListIter<'a, T: 'a> { linked_list: &'a LinkedList, index: Option>, diff --git a/src/ordered_linked_list.rs b/src/ordered_linked_list.rs index 771cf2e..1cb7551 100644 --- a/src/ordered_linked_list.rs +++ b/src/ordered_linked_list.rs @@ -4,6 +4,7 @@ use std::{ops, ptr}; use crate::linked_list::{LinkedList, LinkedListIndex}; #[derive(Copy, Clone, PartialEq, Eq, Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] pub struct OrderedLinkedListIndex<'a>(LinkedListIndex<'a>); impl<'a> OrderedLinkedListIndex<'a> { #[inline(always)] @@ -13,6 +14,7 @@ impl<'a> OrderedLinkedListIndex<'a> { } #[derive(Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] pub struct OrderedLinkedList(LinkedList); impl OrderedLinkedList { pub fn new(cap: usize) -> Self { diff --git a/src/sample.rs b/src/sample.rs index fd63d88..75e307a 100644 --- a/src/sample.rs +++ b/src/sample.rs @@ -4,6 +4,7 @@ use std::{convert::TryFrom, fmt, iter, ops, vec}; /// Given population and sample sizes, returns true if this element is in the sample. Without replacement. 
#[derive(Clone, Debug, Serialize, Deserialize)] +#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] pub struct SampleTotal { total: usize, samples: usize, @@ -39,6 +40,7 @@ impl Drop for SampleTotal { } #[derive(Clone)] +#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] struct FixedCapVec(Vec); impl FixedCapVec { fn new(cap: usize) -> Self { @@ -122,6 +124,7 @@ where /// [Reservoir sampling](https://en.wikipedia.org/wiki/Reservoir_sampling). Without replacement, and the returned order is unstable. #[derive(Clone, Debug, Serialize, Deserialize)] +#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] pub struct SampleUnstable { reservoir: FixedCapVec, i: usize, diff --git a/src/top.rs b/src/top.rs index feb549f..141a06f 100644 --- a/src/top.rs +++ b/src/top.rs @@ -37,6 +37,7 @@ use crate::{ serialize = "A: Hash + Eq + Serialize, C: Serialize, ::Config: Serialize", deserialize = "A: Hash + Eq + Deserialize<'de>, C: Deserialize<'de>, ::Config: Deserialize<'de>" ))] +#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] pub struct Top { map: HashMap, RandomXxHashBuilder>, list: OrderedLinkedList>, @@ -227,6 +228,7 @@ impl< } #[derive(Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] struct Node(T, C); impl Ord for Node { #[inline(always)] @@ -280,6 +282,7 @@ mod test { #[derive(Serialize, Deserialize)] #[serde(bound = "")] + #[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] struct HLL(HyperLogLog); impl Ord for HLL { #[inline(always)]