From b748e9f00a5efb39531a60f2996a5084c8abe3af Mon Sep 17 00:00:00 2001 From: LucaCappelletti94 Date: Thu, 8 Aug 2024 11:21:46 +0200 Subject: [PATCH 1/7] Added mem-dbg as optional feature --- Cargo.toml | 1 + src/count_min.rs | 1 + src/distinct.rs | 2 ++ src/linked_list.rs | 3 +++ src/ordered_linked_list.rs | 2 ++ src/sample.rs | 3 +++ src/top.rs | 3 +++ 7 files changed, 15 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index d076f2f..7778f63 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,3 +26,4 @@ twox-hash = "1.1" serde = { version = "1.0", features = ["derive"] } rand = { version = "0.7", features = ["small_rng"] } packed_simd = { version = "0.3", features = ["into_bits"], optional = true } +mem_dbg = {version="0.2.2", optional=true} diff --git a/src/count_min.rs b/src/count_min.rs index 6a45e39..439d9ca 100644 --- a/src/count_min.rs +++ b/src/count_min.rs @@ -39,6 +39,7 @@ use crate::traits::{Intersect, IntersectPlusUnionIsPlus, New, UnionAssign}; serialize = "C: Serialize, ::Config: Serialize", deserialize = "C: Deserialize<'de>, ::Config: Deserialize<'de>" ))] +#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] pub struct CountMinSketch { counters: Vec>, offsets: Vec, // to avoid malloc/free each push diff --git a/src/distinct.rs b/src/distinct.rs index 1fad41c..8e4b4a4 100644 --- a/src/distinct.rs +++ b/src/distinct.rs @@ -59,6 +59,7 @@ use self::consts::{BIAS_DATA, RAW_ESTIMATE_DATA, TRESHOLD_DATA}; /// Like [`HyperLogLog`] but implements `Ord` and `Eq` by using the estimate of the cardinality. 
#[derive(Serialize, Deserialize)] #[serde(bound = "")] +#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] pub struct HyperLogLogMagnitude(HyperLogLog); impl Ord for HyperLogLogMagnitude { #[inline(always)] @@ -127,6 +128,7 @@ impl IntersectPlusUnionIsPlus for HyperLogLogMagnitude { /// See [*HyperLogLog: the analysis of a near-optimal cardinality estimation algorithm*](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf) and [*HyperLogLog in Practice: Algorithmic Engineering of a State of The Art Cardinality Estimation Algorithm*](https://ai.google/research/pubs/pub40671) for background on HyperLogLog with bias correction. #[derive(Serialize, Deserialize)] #[serde(bound = "")] +#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] pub struct HyperLogLog { alpha: f64, zero: usize, diff --git a/src/linked_list.rs b/src/linked_list.rs index 527391a..e75f4c8 100644 --- a/src/linked_list.rs +++ b/src/linked_list.rs @@ -2,6 +2,7 @@ use serde::{Deserialize, Serialize}; use std::{iter, marker, ops}; #[derive(Copy, Clone, PartialEq, Eq, Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] pub struct LinkedListIndex<'a>(usize, marker::PhantomData<&'a ()>); impl<'a> LinkedListIndex<'a> { #[inline(always)] @@ -11,6 +12,7 @@ impl<'a> LinkedListIndex<'a> { } #[derive(Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] pub struct LinkedList { vec: Box<[(usize, usize, Option)]>, head: usize, @@ -304,6 +306,7 @@ impl<'a, T> ops::IndexMut> for LinkedList { } } + pub struct LinkedListIter<'a, T: 'a> { linked_list: &'a LinkedList, index: Option>, diff --git a/src/ordered_linked_list.rs b/src/ordered_linked_list.rs index 771cf2e..1cb7551 100644 --- a/src/ordered_linked_list.rs +++ b/src/ordered_linked_list.rs @@ -4,6 +4,7 @@ use std::{ops, ptr}; use crate::linked_list::{LinkedList, LinkedListIndex}; #[derive(Copy, Clone, 
PartialEq, Eq, Serialize, Deserialize, Debug)] +#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] pub struct OrderedLinkedListIndex<'a>(LinkedListIndex<'a>); impl<'a> OrderedLinkedListIndex<'a> { #[inline(always)] @@ -13,6 +14,7 @@ impl<'a> OrderedLinkedListIndex<'a> { } #[derive(Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] pub struct OrderedLinkedList(LinkedList); impl OrderedLinkedList { pub fn new(cap: usize) -> Self { diff --git a/src/sample.rs b/src/sample.rs index fd63d88..75e307a 100644 --- a/src/sample.rs +++ b/src/sample.rs @@ -4,6 +4,7 @@ use std::{convert::TryFrom, fmt, iter, ops, vec}; /// Given population and sample sizes, returns true if this element is in the sample. Without replacement. #[derive(Clone, Debug, Serialize, Deserialize)] +#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] pub struct SampleTotal { total: usize, samples: usize, @@ -39,6 +40,7 @@ impl Drop for SampleTotal { } #[derive(Clone)] +#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] struct FixedCapVec(Vec); impl FixedCapVec { fn new(cap: usize) -> Self { @@ -122,6 +124,7 @@ where /// [Reservoir sampling](https://en.wikipedia.org/wiki/Reservoir_sampling). Without replacement, and the returned order is unstable. 
#[derive(Clone, Debug, Serialize, Deserialize)] +#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] pub struct SampleUnstable { reservoir: FixedCapVec, i: usize, diff --git a/src/top.rs b/src/top.rs index feb549f..141a06f 100644 --- a/src/top.rs +++ b/src/top.rs @@ -37,6 +37,7 @@ use crate::{ serialize = "A: Hash + Eq + Serialize, C: Serialize, ::Config: Serialize", deserialize = "A: Hash + Eq + Deserialize<'de>, C: Deserialize<'de>, ::Config: Deserialize<'de>" ))] +#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] pub struct Top { map: HashMap, RandomXxHashBuilder>, list: OrderedLinkedList>, @@ -227,6 +228,7 @@ impl< } #[derive(Clone, Serialize, Deserialize)] +#[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] struct Node(T, C); impl Ord for Node { #[inline(always)] @@ -280,6 +282,7 @@ mod test { #[derive(Serialize, Deserialize)] #[serde(bound = "")] + #[cfg_attr(feature = "mem_dbg", derive(mem_dbg::MemDbg, mem_dbg::MemSize))] struct HLL(HyperLogLog); impl Ord for HLL { #[inline(always)] From f9cb562f619f86e75bf6a09eb720b8b6c1db8fde Mon Sep 17 00:00:00 2001 From: Luca Cappelletti Date: Fri, 9 Aug 2024 14:33:31 +0200 Subject: [PATCH 2/7] Bumped mem-dbg --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 7778f63..844d310 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,4 +26,4 @@ twox-hash = "1.1" serde = { version = "1.0", features = ["derive"] } rand = { version = "0.7", features = ["small_rng"] } packed_simd = { version = "0.3", features = ["into_bits"], optional = true } -mem_dbg = {version="0.2.2", optional=true} +mem_dbg = {version="0.2.4", optional=true} From fd46c150c9fe54fa4e7510290d5a9b4126f1c530 Mon Sep 17 00:00:00 2001 From: LucaCappelletti94 Date: Sun, 11 Aug 2024 08:52:44 +0200 Subject: [PATCH 3/7] Increased assert limit to 18 --- src/distinct.rs | 2 +- src/distinct/consts.rs | 10 ++++++++++ 2 files changed, 11 
insertions(+), 1 deletion(-) diff --git a/src/distinct.rs b/src/distinct.rs index 8e4b4a4..4f1c60f 100644 --- a/src/distinct.rs +++ b/src/distinct.rs @@ -339,7 +339,7 @@ where } fn get_alpha(p: u8) -> f64 { - assert!(4 <= p && p <= 16); + assert!(4 <= p && p <= 18); match p { 4 => 0.673, 5 => 0.697, diff --git a/src/distinct/consts.rs b/src/distinct/consts.rs index 752e2b7..2aa8b09 100644 --- a/src/distinct/consts.rs +++ b/src/distinct/consts.rs @@ -42,6 +42,16 @@ mod test { ret }) } + + #[test] + fn test_length_compatability() { + assert_eq!(TRESHOLD_DATA.len(), RAW_ESTIMATE_DATA.len()); + assert_eq!(RAW_ESTIMATE_DATA.len(), BIAS_DATA.len()); + + for (raw_estimate_data, bias_data) in RAW_ESTIMATE_DATA.iter().zip(BIAS_DATA.iter()) { + assert_eq!(raw_estimate_data.len(), bias_data.len()); + } + } } #[rustfmt::skip] From 9776c98dba2d4264db6cd73dd040805556055894 Mon Sep 17 00:00:00 2001 From: LucaCappelletti94 Date: Tue, 13 Aug 2024 12:00:36 +0200 Subject: [PATCH 4/7] Exposed harmonic sum method --- src/distinct.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/distinct.rs b/src/distinct.rs index 4f1c60f..cae00eb 100644 --- a/src/distinct.rs +++ b/src/distinct.rs @@ -170,6 +170,14 @@ where } } + /// Returns the current harmonic sum of the `HyperLogLog` data structure. + /// + /// The harmonic sum is the sum of the reciprocals of two raised to each register value, i.e. + /// `1/2^m[0] + 1/2^m[1] + ... + 1/2^m[n-1]`. + pub fn harmonic_sum(&self) -> f64 { + self.sum + } + /// "Visit" an element. 
#[inline] pub fn push(&mut self, value: &V) { From 5eb10f9b49687f439b0537b3ff28056f260c49e9 Mon Sep 17 00:00:00 2001 From: LucaCappelletti94 Date: Tue, 13 Aug 2024 12:03:42 +0200 Subject: [PATCH 5/7] Exposed zero registers counter --- src/distinct.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/distinct.rs b/src/distinct.rs index cae00eb..81d6e67 100644 --- a/src/distinct.rs +++ b/src/distinct.rs @@ -178,6 +178,11 @@ where self.sum } + /// Returns the current number of zero registers in the `HyperLogLog` data structure. + pub fn zero_registers(&self) -> usize { + self.zero + } + /// "Visit" an element. #[inline] pub fn push(&mut self, value: &V) { From 08d0906b5e4fe61ea4a149af5fec7dc2ca441708 Mon Sep 17 00:00:00 2001 From: LucaCappelletti94 Date: Tue, 13 Aug 2024 12:17:59 +0200 Subject: [PATCH 6/7] Exposed parameter for the number of registers --- src/distinct.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/distinct.rs b/src/distinct.rs index 81d6e67..a501ada 100644 --- a/src/distinct.rs +++ b/src/distinct.rs @@ -183,6 +183,11 @@ where self.zero } + /// Returns the number of registers in the `HyperLogLog` data structure. + pub fn number_of_registers(&self) -> usize { + self.m.len() + } + /// "Visit" an element. #[inline] pub fn push(&mut self, value: &V) { From 3a18c45819db01353d8f7badeb8da7d6a0db60e2 Mon Sep 17 00:00:00 2001 From: LucaCappelletti94 Date: Wed, 14 Aug 2024 23:14:18 +0200 Subject: [PATCH 7/7] Exposed precision --- src/distinct.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/distinct.rs b/src/distinct.rs index a501ada..29c7b03 100644 --- a/src/distinct.rs +++ b/src/distinct.rs @@ -188,6 +188,11 @@ where self.m.len() } + /// Returns the precision of the `HyperLogLog` data structure. + pub fn precision(&self) -> u8 { + self.p + } + /// "Visit" an element. #[inline] pub fn push(&mut self, value: &V) {