From c521d4304f19988b0907922a28081ee981cd450e Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Tue, 12 Aug 2025 23:20:41 -0700 Subject: [PATCH 1/3] improve object store related metrics --- src/metrics/storage.rs | 131 +++---- src/storage/azure_blob.rs | 599 +++++++++++++++++++++++++----- src/storage/gcs.rs | 592 +++++++++++++++++++++++++----- src/storage/localfs.rs | 474 +++++++++++++++++++++--- src/storage/metrics_layer.rs | 255 +++++++++---- src/storage/s3.rs | 679 +++++++++++++++++++++++++++++------ 6 files changed, 2237 insertions(+), 493 deletions(-) diff --git a/src/metrics/storage.rs b/src/metrics/storage.rs index f96a317d9..d483d0ee8 100644 --- a/src/metrics/storage.rs +++ b/src/metrics/storage.rs @@ -16,150 +16,109 @@ * */ +use crate::metrics::METRICS_NAMESPACE; use actix_web_prometheus::PrometheusMetrics; +use once_cell::sync::Lazy; +use prometheus::{CounterVec, HistogramOpts, HistogramVec, Opts}; + +// Global storage metric used by all storage providers +pub static STORAGE_REQUEST_RESPONSE_TIME: Lazy = Lazy::new(|| { + HistogramVec::new( + HistogramOpts::new("storage_request_response_time", "Storage Request Latency") + .namespace(METRICS_NAMESPACE), + &["provider", "method", "status"], + ) + .expect("metric can be created") +}); + +// Global storage metric for tracking number of files scanned +pub static STORAGE_FILES_SCANNED: Lazy = Lazy::new(|| { + CounterVec::new( + Opts::new( + "storage_files_scanned_total", + "Total number of files scanned in storage operations", + ) + .namespace(METRICS_NAMESPACE), + &["provider", "operation"], + ) + .expect("metric can be created") +}); pub trait StorageMetrics { fn register_metrics(&self, handler: &PrometheusMetrics); } pub mod localfs { - use crate::{metrics::METRICS_NAMESPACE, storage::FSConfig}; - use once_cell::sync::Lazy; - use prometheus::{HistogramOpts, HistogramVec}; - - use super::StorageMetrics; + use crate::storage::FSConfig; - pub static REQUEST_RESPONSE_TIME: Lazy = Lazy::new(|| { - HistogramVec::new( - HistogramOpts::new("local_fs_response_time", "FileSystem Request Latency") - .namespace(METRICS_NAMESPACE), - &["method", "status"], - ) - .expect("metric can be created") - }); + use super::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}; impl StorageMetrics for FSConfig { fn register_metrics(&self, handler: &actix_web_prometheus::PrometheusMetrics) { handler .registry - .register(Box::new(REQUEST_RESPONSE_TIME.clone())) + .register(Box::new(STORAGE_REQUEST_RESPONSE_TIME.clone())) + .expect("metric can be registered"); + handler + .registry + .register(Box::new(STORAGE_FILES_SCANNED.clone())) .expect("metric can be registered"); } } } pub mod s3 { - use crate::{metrics::METRICS_NAMESPACE, storage::S3Config}; - use once_cell::sync::Lazy; - use prometheus::{HistogramOpts, HistogramVec}; + use crate::storage::S3Config; - use super::StorageMetrics; - - pub static REQUEST_RESPONSE_TIME: Lazy = Lazy::new(|| { - HistogramVec::new( - HistogramOpts::new("s3_response_time", "S3 Request Latency") - .namespace(METRICS_NAMESPACE), - &["method", "status"], - ) - .expect("metric can be created") - }); - - pub static QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME: Lazy = Lazy::new(|| { - HistogramVec::new( - HistogramOpts::new("query_s3_response_time", "S3 Request Latency") - .namespace(METRICS_NAMESPACE), - &["method", "status"], - ) - .expect("metric can be created") - }); + use super::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}; impl StorageMetrics for S3Config { fn register_metrics(&self, 
handler: &actix_web_prometheus::PrometheusMetrics) { handler .registry - .register(Box::new(REQUEST_RESPONSE_TIME.clone())) + .register(Box::new(STORAGE_REQUEST_RESPONSE_TIME.clone())) .expect("metric can be registered"); handler .registry - .register(Box::new(QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME.clone())) + .register(Box::new(STORAGE_FILES_SCANNED.clone())) .expect("metric can be registered"); } } } pub mod azureblob { - use crate::{metrics::METRICS_NAMESPACE, storage::AzureBlobConfig}; - use once_cell::sync::Lazy; - use prometheus::{HistogramOpts, HistogramVec}; - - use super::StorageMetrics; + use crate::storage::AzureBlobConfig; - pub static REQUEST_RESPONSE_TIME: Lazy = Lazy::new(|| { - HistogramVec::new( - HistogramOpts::new("azr_blob_response_time", "AzureBlob Request Latency") - .namespace(METRICS_NAMESPACE), - &["method", "status"], - ) - .expect("metric can be created") - }); - - pub static QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME: Lazy = Lazy::new(|| { - HistogramVec::new( - HistogramOpts::new("query_azr_blob_response_time", "AzureBlob Request Latency") - .namespace(METRICS_NAMESPACE), - &["method", "status"], - ) - .expect("metric can be created") - }); + use super::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}; impl StorageMetrics for AzureBlobConfig { fn register_metrics(&self, handler: &actix_web_prometheus::PrometheusMetrics) { handler .registry - .register(Box::new(REQUEST_RESPONSE_TIME.clone())) + .register(Box::new(STORAGE_REQUEST_RESPONSE_TIME.clone())) .expect("metric can be registered"); handler .registry - .register(Box::new(QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME.clone())) + .register(Box::new(STORAGE_FILES_SCANNED.clone())) .expect("metric can be registered"); } } } pub mod gcs { - use crate::{metrics::METRICS_NAMESPACE, storage::GcsConfig}; - use once_cell::sync::Lazy; - use prometheus::{HistogramOpts, HistogramVec}; + use crate::storage::GcsConfig; - use super::StorageMetrics; - - pub static REQUEST_RESPONSE_TIME: Lazy = Lazy::new(|| { - HistogramVec::new( - HistogramOpts::new("gcs_response_time", "GCS Request Latency") - .namespace(METRICS_NAMESPACE), - &["method", "status"], - ) - .expect("metric can be created") - }); - - pub static QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME: Lazy = Lazy::new(|| { - HistogramVec::new( - HistogramOpts::new("query_gcs_response_time", "GCS Request Latency") - .namespace(METRICS_NAMESPACE), - &["method", "status"], - ) - .expect("metric can be created") - }); + use super::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}; impl StorageMetrics for GcsConfig { fn register_metrics(&self, handler: &actix_web_prometheus::PrometheusMetrics) { handler .registry - .register(Box::new(REQUEST_RESPONSE_TIME.clone())) + .register(Box::new(STORAGE_REQUEST_RESPONSE_TIME.clone())) .expect("metric can be registered"); handler .registry - .register(Box::new(QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME.clone())) + .register(Box::new(STORAGE_FILES_SCANNED.clone())) .expect("metric can be registered"); } } diff --git a/src/storage/azure_blob.rs b/src/storage/azure_blob.rs index 1c6cf300b..fcf090126 100644 --- a/src/storage/azure_blob.rs +++ b/src/storage/azure_blob.rs @@ -47,7 +47,7 @@ use url::Url; use crate::{ handlers::http::users::USERS_ROOT_DIR, - metrics::storage::{StorageMetrics, azureblob::REQUEST_RESPONSE_TIME}, + metrics::storage::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}, parseable::LogStream, }; @@ -55,7 +55,7 @@ use super::{ CONNECT_TIMEOUT_SECS, 
MIN_MULTIPART_UPLOAD_SIZE, ObjectStorage, ObjectStorageError, ObjectStorageProvider, PARSEABLE_ROOT_DIRECTORY, REQUEST_TIMEOUT_SECS, SCHEMA_FILE_NAME, STREAM_METADATA_FILE_NAME, STREAM_ROOT_DIRECTORY, metrics_layer::MetricLayer, - object_storage::parseable_json_path, to_object_store_path, + metrics_layer::error_to_status_code, object_storage::parseable_json_path, to_object_store_path, }; #[derive(Debug, Clone, clap::Args)] @@ -167,7 +167,7 @@ impl ObjectStorageProvider for AzureBlobConfig { let azure = self.get_default_builder().build().unwrap(); // limit objectstore to a concurrent request limit let azure = LimitStore::new(azure, super::MAX_OBJECT_STORE_REQUESTS); - let azure = MetricLayer::new(azure); + let azure = MetricLayer::new(azure, "azure_blob"); let object_store_registry = DefaultObjectStoreRegistry::new(); let url = ObjectStoreUrl::parse(format!("https://{}.blob.core.windows.net", self.account)) @@ -212,21 +212,21 @@ impl BlobStore { async fn _get_object(&self, path: &RelativePath) -> Result { let instant = Instant::now(); let resp = self.client.get(&to_object_store_path(path)).await; + let elapsed = instant.elapsed().as_secs_f64(); match resp { Ok(resp) => { - let time = instant.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) - .observe(time); let body = resp.bytes().await.unwrap(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "GET", "200"]) + .observe(elapsed); Ok(body) } Err(err) => { - let time = instant.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "400"]) - .observe(time); + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "GET", status_code]) + .observe(elapsed); Err(err.into()) } } @@ -237,36 +237,66 @@ impl BlobStore { path: &RelativePath, resource: PutPayload, ) -> Result<(), ObjectStorageError> { - let time = Instant::now(); + let instant = Instant::now(); let resp = self.client.put(&to_object_store_path(path), resource).await; - let status = if resp.is_ok() { "200" } else { "400" }; - let time = time.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["PUT", status]) - .observe(time); - - if let Err(object_store::Error::NotFound { source, .. }) = &resp { - return Err(ObjectStorageError::Custom( - format!("Failed to upload, error: {source:?}").to_string(), - )); + let elapsed = instant.elapsed().as_secs_f64(); + match resp { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT", "200"]) + .observe(elapsed); + Ok(()) + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT", status_code]) + .observe(elapsed); + + if let object_store::Error::NotFound { source, .. 
} = &err { + return Err(ObjectStorageError::Custom( + format!("Failed to upload, error: {source:?}").to_string(), + )); + } + Err(err.into()) + } } - - resp.map(|_| ()).map_err(|err| err.into()) } async fn _delete_prefix(&self, key: &str) -> Result<(), ObjectStorageError> { + // Track LIST operation + let list_start = Instant::now(); let object_stream = self.client.list(Some(&(key.into()))); + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); object_stream .for_each_concurrent(None, |x| async { match x { Ok(obj) => { - if (self.client.delete(&obj.location).await).is_err() { - error!("Failed to fetch object during delete stream"); + // Track individual DELETE operation + let delete_start = Instant::now(); + match self.client.delete(&obj.location).await { + Ok(_) => { + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "DELETE", "200"]) + .observe(delete_elapsed); + } + Err(err) => { + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "DELETE", status_code]) + .observe(delete_elapsed); + error!("Failed to delete object during delete stream: {:?}", err); + } } } - Err(_) => { - error!("Failed to fetch object during delete stream"); + Err(err) => { + error!("Failed to fetch object during delete stream: {:?}", err); } }; }) @@ -277,7 +307,27 @@ impl BlobStore { async fn _list_streams(&self) -> Result, ObjectStorageError> { let mut result_file_list = HashSet::new(); - let resp = self.client.list_with_delimiter(None).await?; + + // Track initial LIST operation + let list_start = Instant::now(); + let resp = self.client.list_with_delimiter(None).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let resp = match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); + resp + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; let streams = resp .common_prefixes @@ -290,13 +340,33 @@ impl BlobStore { for stream in streams { let stream_path = object_store::path::Path::from(format!("{}/{}", &stream, STREAM_ROOT_DIRECTORY)); - let resp = self.client.list_with_delimiter(Some(&stream_path)).await?; - if resp - .objects - .iter() - .any(|name| name.location.filename().unwrap().ends_with("stream.json")) - { - result_file_list.insert(stream); + + // Track individual LIST operations for each stream + let stream_list_start = Instant::now(); + let resp = self.client.list_with_delimiter(Some(&stream_path)).await; + let stream_list_elapsed = stream_list_start.elapsed().as_secs_f64(); + + match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(stream_list_elapsed); + + if resp + .objects + .iter() + .any(|name| name.location.filename().unwrap().ends_with("stream.json")) + { + result_file_list.insert(stream); + } + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", status_code]) + .observe(stream_list_elapsed); + return Err(err.into()); + } } } @@ -304,10 +374,29 @@ impl BlobStore { } 
async fn _list_dates(&self, stream: &str) -> Result, ObjectStorageError> { + // Track LIST operation + let list_start = Instant::now(); let resp = self .client .list_with_delimiter(Some(&(stream.into()))) - .await?; + .await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let resp = match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); + resp + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; let common_prefixes = resp.common_prefixes; @@ -326,10 +415,30 @@ impl BlobStore { stream: &str, ) -> Result>, ObjectStorageError> { let mut result_file_list: BTreeMap> = BTreeMap::new(); + + // Track initial LIST operation + let list_start = Instant::now(); let resp = self .client .list_with_delimiter(Some(&(stream.into()))) - .await?; + .await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let resp = match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); + resp + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; let dates = resp .common_prefixes @@ -338,45 +447,74 @@ impl BlobStore { .filter(|name| name.as_ref() != stream && name.as_ref() != STREAM_ROOT_DIRECTORY) .map(|name| name.as_ref().to_string()) .collect::>(); + for date in dates { let date_path = object_store::path::Path::from(format!("{}/{}", stream, &date)); - let resp = self.client.list_with_delimiter(Some(&date_path)).await?; - let manifests: Vec = resp - .objects - .iter() - .filter(|name| name.location.filename().unwrap().ends_with("manifest.json")) - .map(|name| name.location.to_string()) - .collect(); - result_file_list.entry(date).or_default().extend(manifests); + + // Track individual LIST operation for each date + let date_list_start = Instant::now(); + let resp = self.client.list_with_delimiter(Some(&date_path)).await; + let date_list_elapsed = date_list_start.elapsed().as_secs_f64(); + + match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(date_list_elapsed); + + let manifests: Vec = resp + .objects + .iter() + .filter(|name| name.location.filename().unwrap().ends_with("manifest.json")) + .map(|name| name.location.to_string()) + .collect(); + result_file_list.entry(date).or_default().extend(manifests); + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", status_code]) + .observe(date_list_elapsed); + return Err(err.into()); + } + } } Ok(result_file_list) } async fn _upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { - let instant = Instant::now(); - // // TODO: Uncomment this when multipart is fixed // let should_multipart = std::fs::metadata(path)?.len() > MULTIPART_UPLOAD_SIZE as u64; let should_multipart = false; - let res = if should_multipart { + if should_multipart { // self._upload_multipart(key, path).await // this branch will never get executed Ok(()) } else { let bytes = tokio::fs::read(path).await?; - let result = self.client.put(&key.into(), bytes.into()).await?; - info!("Uploaded file to Azure Blob 
Storage: {:?}", result); - Ok(()) - }; - let status = if res.is_ok() { "200" } else { "400" }; - let time = instant.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["UPLOAD_PARQUET", status]) - .observe(time); - - res + let put_start = Instant::now(); + let result = self.client.put(&key.into(), bytes.into()).await; + let put_elapsed = put_start.elapsed().as_secs_f64(); + + match result { + Ok(result) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT", "200"]) + .observe(put_elapsed); + info!("Uploaded file to Azure Blob Storage: {:?}", result); + Ok(()) + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT", status_code]) + .observe(put_elapsed); + Err(err.into()) + } + } + } } async fn _upload_multipart( @@ -387,14 +525,52 @@ impl BlobStore { let mut file = OpenOptions::new().read(true).open(path).await?; let location = &to_object_store_path(key); - let mut async_writer = self.client.put_multipart(location).await?; + // Track multipart initiation + let multipart_start = Instant::now(); + let async_writer = self.client.put_multipart(location).await; + let multipart_elapsed = multipart_start.elapsed().as_secs_f64(); + + let mut async_writer = match async_writer { + Ok(writer) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT_MULTIPART_INIT", "200"]) + .observe(multipart_elapsed); + writer + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT_MULTIPART_INIT", status_code]) + .observe(multipart_elapsed); + return Err(err.into()); + } + }; let meta = file.metadata().await?; let total_size = meta.len() as usize; if total_size < MIN_MULTIPART_UPLOAD_SIZE { let mut data = Vec::new(); file.read_to_end(&mut data).await?; - self.client.put(location, data.into()).await?; + + // Track single PUT operation for small files + let put_start = Instant::now(); + let result = self.client.put(location, data.into()).await; + let put_elapsed = put_start.elapsed().as_secs_f64(); + + match result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT", "200"]) + .observe(put_elapsed); + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT", status_code]) + .observe(put_elapsed); + return Err(err.into()); + } + } // async_writer.put_part(data.into()).await?; // async_writer.complete().await?; return Ok(()); @@ -408,7 +584,7 @@ impl BlobStore { let num_full_parts = total_size / MIN_MULTIPART_UPLOAD_SIZE; let total_parts = num_full_parts + if has_final_partial_part { 1 } else { 0 }; - // Upload each part + // Upload each part with metrics for part_number in 0..(total_parts) { let start_pos = part_number * MIN_MULTIPART_UPLOAD_SIZE; let end_pos = if part_number == num_full_parts && has_final_partial_part { @@ -422,15 +598,47 @@ impl BlobStore { // Extract this part's data let part_data = data[start_pos..end_pos].to_vec(); - // Upload the part - async_writer.put_part(part_data.into()).await?; + // Track individual part upload + let part_start = Instant::now(); + let result = async_writer.put_part(part_data.into()).await; + let part_elapsed = part_start.elapsed().as_secs_f64(); + + match result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT_MULTIPART_PART", "200"]) + .observe(part_elapsed); + } + Err(err) 
=> { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT_MULTIPART_PART", status_code]) + .observe(part_elapsed); + return Err(err.into()); + } + } // upload_parts.push(part_number as u64 + 1); } - if let Err(err) = async_writer.complete().await { + + // Track multipart completion + let complete_start = Instant::now(); + let complete_result = async_writer.complete().await; + let complete_elapsed = complete_start.elapsed().as_secs_f64(); + + if let Err(err) = complete_result { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT_MULTIPART_COMPLETE", status_code]) + .observe(complete_elapsed); error!("Failed to complete multipart upload. {:?}", err); async_writer.abort().await?; - }; + return Err(err.into()); + } else { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT_MULTIPART_COMPLETE", "200"]) + .observe(complete_elapsed); + } } Ok(()) } @@ -500,6 +708,11 @@ impl ObjectStorage for BlobStore { ))) } async fn head(&self, _path: &RelativePath) -> Result { + // Record attempt to access file (even though operation not implemented) + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "HEAD"]) + .inc(); + Err(ObjectStorageError::UnhandledError(Box::new( std::io::Error::new( std::io::ErrorKind::Unsupported, @@ -509,7 +722,14 @@ impl ObjectStorage for BlobStore { } async fn get_object(&self, path: &RelativePath) -> Result { - Ok(self._get_object(path).await?) + let result = self._get_object(path).await?; + + // Record single file accessed + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "GET"]) + .inc(); + + Ok(result) } async fn get_objects( @@ -517,19 +737,39 @@ impl ObjectStorage for BlobStore { base_path: Option<&RelativePath>, filter_func: Box bool + Send>, ) -> Result, ObjectStorageError> { - let instant = Instant::now(); - let prefix = if let Some(base_path) = base_path { to_object_store_path(base_path) } else { self.root.clone() }; + // Track list operation + let list_start = Instant::now(); let mut list_stream = self.client.list(Some(&prefix)); let mut res = vec![]; + let mut files_scanned = 0; - while let Some(meta) = list_stream.next().await.transpose()? 
{ + while let Some(meta_result) = list_stream.next().await { + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let meta = match meta_result { + Ok(meta) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); + meta + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; + + files_scanned += 1; let ingestor_file = filter_func(meta.location.filename().unwrap().to_string()); if !ingestor_file { @@ -546,10 +786,10 @@ impl ObjectStorage for BlobStore { res.push(byts); } - let instant = instant.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) - .observe(instant); + // Record total files scanned + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "GET"]) + .inc_by(files_scanned as f64); Ok(res) } @@ -557,11 +797,33 @@ impl ObjectStorage for BlobStore { async fn get_ingestor_meta_file_paths( &self, ) -> Result, ObjectStorageError> { - let time = Instant::now(); let mut path_arr = vec![]; + let mut files_scanned = 0; + + // Track list operation + let list_start = Instant::now(); let mut object_stream = self.client.list(Some(&self.root)); - while let Some(meta) = object_stream.next().await.transpose()? { + while let Some(meta_result) = object_stream.next().await { + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let meta = match meta_result { + Ok(meta) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); + meta + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; + + files_scanned += 1; let flag = meta.location.filename().unwrap().starts_with("ingestor"); if flag { @@ -569,10 +831,10 @@ impl ObjectStorage for BlobStore { } } - let time = time.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) - .observe(time); + // Record total files scanned + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "LIST"]) + .inc_by(files_scanned as f64); Ok(path_arr) } @@ -581,12 +843,34 @@ impl ObjectStorage for BlobStore { &self, stream_name: &str, ) -> Result, ObjectStorageError> { - let time = Instant::now(); let mut path_arr = vec![]; + let mut files_scanned = 0; let path = to_object_store_path(&RelativePathBuf::from(stream_name)); + + // Track list operation + let list_start = Instant::now(); let mut object_stream = self.client.list(Some(&path)); - while let Some(meta) = object_stream.next().await.transpose()? 
{ + while let Some(meta_result) = object_stream.next().await { + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let meta = match meta_result { + Ok(meta) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); + meta + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; + + files_scanned += 1; let flag = meta.location.filename().unwrap().starts_with(".ingestor"); if flag { @@ -600,10 +884,10 @@ impl ObjectStorage for BlobStore { ])); path_arr.push(RelativePathBuf::from_iter([stream_name, SCHEMA_FILE_NAME])); - let time = time.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) - .observe(time); + // Record total files scanned + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "LIST"]) + .inc_by(files_scanned as f64); Ok(path_arr) } @@ -617,6 +901,11 @@ impl ObjectStorage for BlobStore { .await .map_err(|err| ObjectStorageError::ConnectionError(Box::new(err)))?; + // Record single file written + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "PUT"]) + .inc(); + Ok(()) } @@ -627,15 +916,54 @@ impl ObjectStorage for BlobStore { } async fn delete_object(&self, path: &RelativePath) -> Result<(), ObjectStorageError> { - Ok(self.client.delete(&to_object_store_path(path)).await?) + let delete_start = Instant::now(); + let result = self.client.delete(&to_object_store_path(path)).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "DELETE", "200"]) + .observe(delete_elapsed); + // Record single file deleted + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "DELETE"]) + .inc(); + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "DELETE", status_code]) + .observe(delete_elapsed); + } + } + + Ok(result?) } async fn check(&self) -> Result<(), ObjectStorageError> { - Ok(self + let head_start = Instant::now(); + let result = self .client .head(&to_object_store_path(&parseable_json_path())) - .await - .map(|_| ())?) + .await; + let head_elapsed = head_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "HEAD", "200"]) + .observe(head_elapsed); + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "HEAD", status_code]) + .observe(head_elapsed); + } + } + + Ok(result.map(|_| ())?) 
} async fn delete_stream(&self, stream_name: &str) -> Result<(), ObjectStorageError> { @@ -646,9 +974,24 @@ impl ObjectStorage for BlobStore { async fn try_delete_node_meta(&self, node_filename: String) -> Result<(), ObjectStorageError> { let file = RelativePathBuf::from(&node_filename); - match self.client.delete(&to_object_store_path(&file)).await { - Ok(_) => Ok(()), + + let delete_start = Instant::now(); + let result = self.client.delete(&to_object_store_path(&file)).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + + match result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "DELETE", "200"]) + .observe(delete_elapsed); + Ok(()) + } Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "DELETE", status_code]) + .observe(delete_elapsed); + // if the object is not found, it is not an error // the given url path was incorrect if matches!(err, object_store::Error::NotFound { .. }) { @@ -667,7 +1010,13 @@ impl ObjectStorage for BlobStore { } async fn list_old_streams(&self) -> Result, ObjectStorageError> { + // Track LIST operation + let list_start = Instant::now(); let resp = self.client.list_with_delimiter(None).await?; + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); let common_prefixes = resp.common_prefixes; // get all dirs @@ -683,7 +1032,27 @@ impl ObjectStorage for BlobStore { for dir in &dirs { let key = format!("{dir}/{STREAM_METADATA_FILE_NAME}"); - let task = async move { self.client.head(&StorePath::from(key)).await.map(|_| ()) }; + let task = async move { + let head_start = Instant::now(); + let result = self.client.head(&StorePath::from(key)).await; + let head_elapsed = head_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "HEAD", "200"]) + .observe(head_elapsed); + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "HEAD", status_code]) + .observe(head_elapsed); + } + } + + result.map(|_| ()) + }; stream_json_check.push(task); } @@ -795,7 +1164,26 @@ impl ObjectStorage for BlobStore { async fn list_dirs(&self) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from("/"); - let resp = self.client.list_with_delimiter(Some(&pre)).await?; + + let list_start = Instant::now(); + let resp = self.client.list_with_delimiter(Some(&pre)).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let resp = match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); + resp + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; Ok(resp .common_prefixes @@ -810,7 +1198,26 @@ impl ObjectStorage for BlobStore { relative_path: &RelativePath, ) -> Result, ObjectStorageError> { let prefix = object_store::path::Path::from(relative_path.as_str()); - let resp = self.client.list_with_delimiter(Some(&prefix)).await?; + + let list_start = Instant::now(); + let resp = self.client.list_with_delimiter(Some(&prefix)).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let resp = match resp { + Ok(resp) => { 
+ STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); + resp + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; Ok(resp .common_prefixes diff --git a/src/storage/gcs.rs b/src/storage/gcs.rs index 8171344f5..177f8f57d 100644 --- a/src/storage/gcs.rs +++ b/src/storage/gcs.rs @@ -25,7 +25,7 @@ use std::{ use crate::{ handlers::http::users::USERS_ROOT_DIR, - metrics::storage::{StorageMetrics, gcs::REQUEST_RESPONSE_TIME}, + metrics::storage::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}, parseable::LogStream, }; use async_trait::async_trait; @@ -53,7 +53,7 @@ use super::{ CONNECT_TIMEOUT_SECS, MIN_MULTIPART_UPLOAD_SIZE, ObjectStorage, ObjectStorageError, ObjectStorageProvider, PARSEABLE_ROOT_DIRECTORY, REQUEST_TIMEOUT_SECS, SCHEMA_FILE_NAME, STREAM_METADATA_FILE_NAME, STREAM_ROOT_DIRECTORY, metrics_layer::MetricLayer, - object_storage::parseable_json_path, to_object_store_path, + metrics_layer::error_to_status_code, object_storage::parseable_json_path, to_object_store_path, }; #[derive(Debug, Clone, clap::Args)] @@ -129,7 +129,7 @@ impl ObjectStorageProvider for GcsConfig { // limit objectstore to a concurrent request limit let gcs = LimitStore::new(gcs, super::MAX_OBJECT_STORE_REQUESTS); - let gcs = MetricLayer::new(gcs); + let gcs = MetricLayer::new(gcs, "gcs"); let object_store_registry = DefaultObjectStoreRegistry::new(); // Register GCS client under the "gs://" scheme so DataFusion can route @@ -175,24 +175,23 @@ pub struct Gcs { impl Gcs { async fn _get_object(&self, path: &RelativePath) -> Result { - let instant = Instant::now(); - + let get_start = Instant::now(); let resp = self.client.get(&to_object_store_path(path)).await; + let get_elapsed = get_start.elapsed().as_secs_f64(); match resp { Ok(resp) => { - let time = instant.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) - .observe(time); let body = resp.bytes().await.unwrap(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "GET", "200"]) + .observe(get_elapsed); Ok(body) } Err(err) => { - let time = instant.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "400"]) - .observe(time); + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "GET", status_code]) + .observe(get_elapsed); Err(err.into()) } } @@ -203,39 +202,73 @@ impl Gcs { path: &RelativePath, resource: PutPayload, ) -> Result<(), ObjectStorageError> { - let time = Instant::now(); + let put_start = Instant::now(); let resp = self.client.put(&to_object_store_path(path), resource).await; - let status = if resp.is_ok() { "200" } else { "400" }; - let time = time.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["PUT", status]) - .observe(time); + let put_elapsed = put_start.elapsed().as_secs_f64(); if let Err(object_store::Error::NotFound { source, .. 
}) = &resp { let source_str = source.to_string(); if source_str.contains("NoSuchBucket") { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT", "404"]) + .observe(put_elapsed); return Err(ObjectStorageError::Custom( format!("Bucket '{}' does not exist in GCS.", self.bucket).to_string(), )); } } - resp.map(|_| ()).map_err(|err| err.into()) + match resp { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT", "200"]) + .observe(put_elapsed); + Ok(()) + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT", status_code]) + .observe(put_elapsed); + Err(err.into()) + } + } } async fn _delete_prefix(&self, key: &str) -> Result<(), ObjectStorageError> { + // Track LIST operation + let list_start = Instant::now(); let object_stream = self.client.list(Some(&(key.into()))); + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); object_stream .for_each_concurrent(None, |x| async { match x { Ok(obj) => { - if (self.client.delete(&obj.location).await).is_err() { - error!("Failed to fetch object during delete stream"); + // Track individual DELETE operation + let delete_start = Instant::now(); + match self.client.delete(&obj.location).await { + Ok(_) => { + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "DELETE", "200"]) + .observe(delete_elapsed); + } + Err(err) => { + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "DELETE", status_code]) + .observe(delete_elapsed); + error!("Failed to delete object during delete stream: {:?}", err); + } } } - Err(_) => { - error!("Failed to fetch object during delete stream"); + Err(err) => { + error!("Failed to fetch object during delete stream: {:?}", err); } }; }) @@ -246,7 +279,27 @@ impl Gcs { async fn _list_streams(&self) -> Result, ObjectStorageError> { let mut result_file_list = HashSet::new(); - let resp = self.client.list_with_delimiter(None).await?; + + // Track initial LIST operation + let list_start = Instant::now(); + let resp = self.client.list_with_delimiter(None).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let resp = match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); + resp + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; let streams = resp .common_prefixes @@ -259,13 +312,35 @@ impl Gcs { for stream in streams { let stream_path = object_store::path::Path::from(format!("{}/{}", &stream, STREAM_ROOT_DIRECTORY)); - let resp = self.client.list_with_delimiter(Some(&stream_path)).await?; - if resp - .objects - .iter() - .any(|name| name.location.filename().unwrap().ends_with("stream.json")) - { - result_file_list.insert(stream); + + // Track individual LIST operations for each stream + let stream_list_start = Instant::now(); + let resp = self.client.list_with_delimiter(Some(&stream_path)).await; + let stream_list_elapsed = stream_list_start.elapsed().as_secs_f64(); + + match &resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) 
+ .observe(stream_list_elapsed); + + if resp + .objects + .iter() + .any(|name| name.location.filename().unwrap().ends_with("stream.json")) + { + result_file_list.insert(stream); + } + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", status_code]) + .observe(stream_list_elapsed); + return Err(ObjectStorageError::UnhandledError(Box::new( + std::io::Error::other(format!("List operation failed: {}", err)), + ))); + } } } @@ -273,10 +348,29 @@ impl Gcs { } async fn _list_dates(&self, stream: &str) -> Result, ObjectStorageError> { + // Track LIST operation + let list_start = Instant::now(); let resp = self .client .list_with_delimiter(Some(&(stream.into()))) - .await?; + .await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let resp = match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); + resp + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; let common_prefixes = resp.common_prefixes; @@ -295,10 +389,30 @@ impl Gcs { stream: &str, ) -> Result>, ObjectStorageError> { let mut result_file_list: BTreeMap> = BTreeMap::new(); + + // Track initial LIST operation + let list_start = Instant::now(); let resp = self .client .list_with_delimiter(Some(&(stream.into()))) - .await?; + .await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let resp = match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); + resp + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; let dates = resp .common_prefixes @@ -307,32 +421,63 @@ impl Gcs { .filter(|name| name.as_ref() != stream && name.as_ref() != STREAM_ROOT_DIRECTORY) .map(|name| name.as_ref().to_string()) .collect::>(); + for date in dates { let date_path = object_store::path::Path::from(format!("{}/{}", stream, &date)); - let resp = self.client.list_with_delimiter(Some(&date_path)).await?; - let manifests: Vec = resp - .objects - .iter() - .filter(|name| name.location.filename().unwrap().ends_with("manifest.json")) - .map(|name| name.location.to_string()) - .collect(); - result_file_list.entry(date).or_default().extend(manifests); + + // Track individual LIST operation for each date + let date_list_start = Instant::now(); + let resp = self.client.list_with_delimiter(Some(&date_path)).await; + let date_list_elapsed = date_list_start.elapsed().as_secs_f64(); + + match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(date_list_elapsed); + + let manifests: Vec = resp + .objects + .iter() + .filter(|name| name.location.filename().unwrap().ends_with("manifest.json")) + .map(|name| name.location.to_string()) + .collect(); + result_file_list.entry(date).or_default().extend(manifests); + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", status_code]) + .observe(date_list_elapsed); + return Err(err.into()); + } + } } Ok(result_file_list) } async fn _upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { - let instant = 
Instant::now(); - let bytes = tokio::fs::read(path).await?; - let result = self.client.put(&key.into(), bytes.into()).await?; - info!("Uploaded file to GCS: {:?}", result); - - let time = instant.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["UPLOAD_PARQUET", "200"]) - .observe(time); - Ok(()) + let put_start = Instant::now(); + let result = self.client.put(&key.into(), bytes.into()).await; + let put_elapsed = put_start.elapsed().as_secs_f64(); + + match result { + Ok(result) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT", "200"]) + .observe(put_elapsed); + info!("Uploaded file to GCS: {:?}", result); + Ok(()) + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT", status_code]) + .observe(put_elapsed); + Err(err.into()) + } + } } async fn _upload_multipart( @@ -343,14 +488,52 @@ impl Gcs { let mut file = OpenOptions::new().read(true).open(path).await?; let location = &to_object_store_path(key); - let mut async_writer = self.client.put_multipart(location).await?; + // Track multipart initiation + let multipart_start = Instant::now(); + let async_writer = self.client.put_multipart(location).await; + let multipart_elapsed = multipart_start.elapsed().as_secs_f64(); + + let mut async_writer = match async_writer { + Ok(writer) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT_MULTIPART_INIT", "200"]) + .observe(multipart_elapsed); + writer + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT_MULTIPART_INIT", status_code]) + .observe(multipart_elapsed); + return Err(err.into()); + } + }; let meta = file.metadata().await?; let total_size = meta.len() as usize; if total_size < MIN_MULTIPART_UPLOAD_SIZE { let mut data = Vec::new(); file.read_to_end(&mut data).await?; - self.client.put(location, data.into()).await?; + + // Track single PUT operation for small files + let put_start = Instant::now(); + let result = self.client.put(location, data.into()).await; + let put_elapsed = put_start.elapsed().as_secs_f64(); + + match result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT", "200"]) + .observe(put_elapsed); + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT", status_code]) + .observe(put_elapsed); + return Err(err.into()); + } + } return Ok(()); } else { let mut data = Vec::new(); @@ -360,7 +543,7 @@ impl Gcs { let num_full_parts = total_size / MIN_MULTIPART_UPLOAD_SIZE; let total_parts = num_full_parts + if has_final_partial_part { 1 } else { 0 }; - // Upload each part + // Upload each part with metrics for part_number in 0..(total_parts) { let start_pos = part_number * MIN_MULTIPART_UPLOAD_SIZE; let end_pos = if part_number == num_full_parts && has_final_partial_part { @@ -374,10 +557,37 @@ impl Gcs { // Extract this part's data let part_data = data[start_pos..end_pos].to_vec(); - // Upload the part - async_writer.put_part(part_data.into()).await?; + // Track individual part upload + let part_start = Instant::now(); + let result = async_writer.put_part(part_data.into()).await; + let part_elapsed = part_start.elapsed().as_secs_f64(); + + match result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT_MULTIPART_PART", "200"]) + .observe(part_elapsed); + } + Err(err) => { + let status_code = error_to_status_code(&err); 
+ STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT_MULTIPART_PART", status_code]) + .observe(part_elapsed); + return Err(err.into()); + } + } } - if let Err(err) = async_writer.complete().await { + + // Track multipart completion + let complete_start = Instant::now(); + let complete_result = async_writer.complete().await; + let complete_elapsed = complete_start.elapsed().as_secs_f64(); + + if let Err(err) = complete_result { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT_MULTIPART_COMPLETE", status_code]) + .observe(complete_elapsed); if let Err(abort_err) = async_writer.abort().await { error!( "Failed to abort multipart upload after completion failure: {:?}", @@ -385,7 +595,11 @@ impl Gcs { ); } return Err(err.into()); - }; + } else { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "PUT_MULTIPART_COMPLETE", "200"]) + .observe(complete_elapsed); + } } Ok(()) } @@ -398,7 +612,26 @@ impl ObjectStorage for Gcs { path: &RelativePath, ) -> Result { let path = &to_object_store_path(path); - let meta = self.client.head(path).await?; + + let head_start = Instant::now(); + let meta = self.client.head(path).await; + let head_elapsed = head_start.elapsed().as_secs_f64(); + + let meta = match meta { + Ok(meta) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "HEAD", "200"]) + .observe(head_elapsed); + meta + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "HEAD", status_code]) + .observe(head_elapsed); + return Err(err.into()); + } + }; let store: Arc = self.client.clone(); let buf = object_store::buffered::BufReader::new(store, &meta); @@ -412,11 +645,40 @@ impl ObjectStorage for Gcs { self._upload_multipart(key, path).await } async fn head(&self, path: &RelativePath) -> Result { - Ok(self.client.head(&to_object_store_path(path)).await?) + let head_start = Instant::now(); + let result = self.client.head(&to_object_store_path(path)).await; + let head_elapsed = head_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "HEAD", "200"]) + .observe(head_elapsed); + // Record single file accessed + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "HEAD"]) + .inc(); + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "HEAD", status_code]) + .observe(head_elapsed); + } + } + + Ok(result?) } async fn get_object(&self, path: &RelativePath) -> Result { - Ok(self._get_object(path).await?) 
+ let result = self._get_object(path).await?; + + // Record single file accessed + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "GET"]) + .inc(); + + Ok(result) } async fn get_objects( @@ -424,19 +686,40 @@ impl ObjectStorage for Gcs { base_path: Option<&RelativePath>, filter_func: Box bool + Send>, ) -> Result, ObjectStorageError> { - let instant = Instant::now(); - let prefix = if let Some(base_path) = base_path { to_object_store_path(base_path) } else { self.root.clone() }; + // Track list operation + let list_start = Instant::now(); let mut list_stream = self.client.list(Some(&prefix)); let mut res = vec![]; + let mut files_scanned = 0; + + // Note: We track each streaming list item retrieval + while let Some(meta_result) = list_stream.next().await { + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let meta = match meta_result { + Ok(meta) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); + meta + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; - while let Some(meta) = list_stream.next().await.transpose()? { + files_scanned += 1; let ingestor_file = filter_func(meta.location.filename().unwrap().to_string()); if !ingestor_file { @@ -453,10 +736,10 @@ impl ObjectStorage for Gcs { res.push(byts); } - let instant = instant.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) - .observe(instant); + // Record total files scanned + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "GET"]) + .inc_by(files_scanned as f64); Ok(res) } @@ -464,11 +747,33 @@ impl ObjectStorage for Gcs { async fn get_ingestor_meta_file_paths( &self, ) -> Result, ObjectStorageError> { - let time = Instant::now(); let mut path_arr = vec![]; + let mut files_scanned = 0; + + // Track list operation + let list_start = Instant::now(); let mut object_stream = self.client.list(Some(&self.root)); - while let Some(meta) = object_stream.next().await.transpose()? { + while let Some(meta_result) = object_stream.next().await { + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let meta = match meta_result { + Ok(meta) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); + meta + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; + + files_scanned += 1; let flag = meta.location.filename().unwrap().starts_with("ingestor"); if flag { @@ -476,10 +781,10 @@ impl ObjectStorage for Gcs { } } - let time = time.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) - .observe(time); + // Record total files scanned + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "LIST"]) + .inc_by(files_scanned as f64); Ok(path_arr) } @@ -488,12 +793,34 @@ impl ObjectStorage for Gcs { &self, stream_name: &str, ) -> Result, ObjectStorageError> { - let time = Instant::now(); let mut path_arr = vec![]; + let mut files_scanned = 0; let path = to_object_store_path(&RelativePathBuf::from(stream_name)); + + // Track list operation + let list_start = Instant::now(); let mut object_stream = self.client.list(Some(&path)); - while let Some(meta) = object_stream.next().await.transpose()? 
{ + while let Some(meta_result) = object_stream.next().await { + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let meta = match meta_result { + Ok(meta) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); + meta + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; + + files_scanned += 1; let flag = meta.location.filename().unwrap().starts_with(".ingestor"); if flag { @@ -507,10 +834,10 @@ impl ObjectStorage for Gcs { ])); path_arr.push(RelativePathBuf::from_iter([stream_name, SCHEMA_FILE_NAME])); - let time = time.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) - .observe(time); + // Record total files scanned + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "LIST"]) + .inc_by(files_scanned as f64); Ok(path_arr) } @@ -524,6 +851,11 @@ impl ObjectStorage for Gcs { .await .map_err(|err| ObjectStorageError::ConnectionError(Box::new(err)))?; + // Record single file written + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "PUT"]) + .inc(); + Ok(()) } @@ -534,7 +866,14 @@ impl ObjectStorage for Gcs { } async fn delete_object(&self, path: &RelativePath) -> Result<(), ObjectStorageError> { - Ok(self.client.delete(&to_object_store_path(path)).await?) + let result = self.client.delete(&to_object_store_path(path)).await?; + + // Record single file deleted + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "DELETE"]) + .inc(); + + Ok(result) } async fn check(&self) -> Result<(), ObjectStorageError> { @@ -553,9 +892,24 @@ impl ObjectStorage for Gcs { async fn try_delete_node_meta(&self, node_filename: String) -> Result<(), ObjectStorageError> { let file = RelativePathBuf::from(&node_filename); - match self.client.delete(&to_object_store_path(&file)).await { - Ok(_) => Ok(()), + + let delete_start = Instant::now(); + let result = self.client.delete(&to_object_store_path(&file)).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + + match result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "DELETE", "200"]) + .observe(delete_elapsed); + Ok(()) + } Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "DELETE", status_code]) + .observe(delete_elapsed); + // if the object is not found, it is not an error // the given url path was incorrect if matches!(err, object_store::Error::NotFound { .. 
}) { @@ -574,7 +928,13 @@ impl ObjectStorage for Gcs { } async fn list_old_streams(&self) -> Result, ObjectStorageError> { + // Track LIST operation + let list_start = Instant::now(); let resp = self.client.list_with_delimiter(None).await?; + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); let common_prefixes = resp.common_prefixes; // get all dirs @@ -590,7 +950,27 @@ impl ObjectStorage for Gcs { for dir in &dirs { let key = format!("{dir}/{STREAM_METADATA_FILE_NAME}"); - let task = async move { self.client.head(&StorePath::from(key)).await.map(|_| ()) }; + let task = async move { + let head_start = Instant::now(); + let result = self.client.head(&StorePath::from(key)).await; + let head_elapsed = head_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "HEAD", "200"]) + .observe(head_elapsed); + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "HEAD", status_code]) + .observe(head_elapsed); + } + } + + result.map(|_| ()) + }; stream_json_check.push(task); } @@ -698,7 +1078,26 @@ impl ObjectStorage for Gcs { async fn list_dirs(&self) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from("/"); - let resp = self.client.list_with_delimiter(Some(&pre)).await?; + + let list_start = Instant::now(); + let resp = self.client.list_with_delimiter(Some(&pre)).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let resp = match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); + resp + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; Ok(resp .common_prefixes @@ -713,7 +1112,26 @@ impl ObjectStorage for Gcs { relative_path: &RelativePath, ) -> Result, ObjectStorageError> { let prefix = object_store::path::Path::from(relative_path.as_str()); - let resp = self.client.list_with_delimiter(Some(&prefix)).await?; + + let list_start = Instant::now(); + let resp = self.client.list_with_delimiter(Some(&prefix)).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let resp = match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); + resp + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; Ok(resp .common_prefixes diff --git a/src/storage/localfs.rs b/src/storage/localfs.rs index 82eca88fe..2822b6b68 100644 --- a/src/storage/localfs.rs +++ b/src/storage/localfs.rs @@ -38,7 +38,7 @@ use tokio_stream::wrappers::ReadDirStream; use crate::{ handlers::http::users::USERS_ROOT_DIR, - metrics::storage::{StorageMetrics, azureblob::REQUEST_RESPONSE_TIME}, + metrics::storage::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}, option::validation, parseable::LogStream, storage::SETTINGS_ROOT_DIRECTORY, @@ -130,6 +130,11 @@ impl ObjectStorage for LocalFS { ))) } async fn head(&self, _path: &RelativePath) -> Result { + // Record attempt to access file (even though operation not implemented) + STORAGE_FILES_SCANNED + 
.with_label_values(&["localfs", "HEAD"]) + .inc(); + Err(ObjectStorageError::UnhandledError(Box::new( std::io::Error::new( std::io::ErrorKind::Unsupported, @@ -138,35 +143,70 @@ impl ObjectStorage for LocalFS { ))) } async fn get_object(&self, path: &RelativePath) -> Result { - let time = Instant::now(); let file_path = self.path_in_root(path); - let res: Result = match fs::read(file_path).await { - Ok(x) => Ok(x.into()), + + let get_start = Instant::now(); + let file_result = fs::read(file_path).await; + let get_elapsed = get_start.elapsed().as_secs_f64(); + + let res: Result = match file_result { + Ok(x) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "GET", "200"]) + .observe(get_elapsed); + // Record single file accessed successfully + STORAGE_FILES_SCANNED + .with_label_values(&["localfs", "GET"]) + .inc(); + Ok(x.into()) + } Err(e) => match e.kind() { std::io::ErrorKind::NotFound => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "GET", "404"]) + .observe(get_elapsed); Err(ObjectStorageError::NoSuchKey(path.to_string())) } - _ => Err(ObjectStorageError::UnhandledError(Box::new(e))), + _ => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "GET", "500"]) + .observe(get_elapsed); + Err(ObjectStorageError::UnhandledError(Box::new(e))) + } }, }; - let status = if res.is_ok() { "200" } else { "400" }; - let time = time.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", status]) - .observe(time); res } async fn get_ingestor_meta_file_paths( &self, ) -> Result, ObjectStorageError> { - let time = Instant::now(); - let mut path_arr = vec![]; - let mut entries = fs::read_dir(&self.root).await?; + let mut files_scanned = 0u64; + + // Track list operation + let list_start = Instant::now(); + let entries_result = fs::read_dir(&self.root).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let mut entries = match entries_result { + Ok(entries) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", "200"]) + .observe(list_elapsed); + entries + } + Err(err) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", "404"]) + .observe(list_elapsed); + return Err(err.into()); + } + }; while let Some(entry) = entries.next_entry().await? 
{ + files_scanned += 1; let flag = entry .path() .file_name() @@ -183,10 +223,10 @@ impl ObjectStorage for LocalFS { } } - let time = time.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) // this might not be the right status code - .observe(time); + // Record total files scanned + STORAGE_FILES_SCANNED + .with_label_values(&["localfs", "LIST"]) + .inc_by(files_scanned as f64); Ok(path_arr) } @@ -195,14 +235,34 @@ impl ObjectStorage for LocalFS { &self, stream_name: &str, ) -> Result, ObjectStorageError> { - let time = Instant::now(); let mut path_arr = vec![]; + let mut files_scanned = 0u64; // = data/stream_name let stream_dir_path = self.path_in_root(&RelativePathBuf::from(stream_name)); - let mut entries = fs::read_dir(&stream_dir_path).await?; + + // Track list operation + let list_start = Instant::now(); + let entries_result = fs::read_dir(&stream_dir_path).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let mut entries = match entries_result { + Ok(entries) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", "200"]) + .observe(list_elapsed); + entries + } + Err(err) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", "404"]) + .observe(list_elapsed); + return Err(err.into()); + } + }; while let Some(entry) = entries.next_entry().await? { + files_scanned += 1; let flag = entry .path() .file_name() @@ -227,10 +287,10 @@ impl ObjectStorage for LocalFS { ])); path_arr.push(RelativePathBuf::from_iter([stream_name, SCHEMA_FILE_NAME])); - let time = time.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) // this might not be the right status code - .observe(time); + // Record total files scanned + STORAGE_FILES_SCANNED + .with_label_values(&["localfs", "LIST"]) + .inc_by(files_scanned as f64); Ok(path_arr) } @@ -241,16 +301,33 @@ impl ObjectStorage for LocalFS { base_path: Option<&RelativePath>, filter_func: Box<(dyn Fn(String) -> bool + std::marker::Send + 'static)>, ) -> Result, ObjectStorageError> { - let time = Instant::now(); - + let list_start = Instant::now(); let prefix = if let Some(path) = base_path { path.to_path(&self.root) } else { self.root.clone() }; - let mut entries = fs::read_dir(&prefix).await?; + let entries_result = fs::read_dir(&prefix).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let mut entries = match entries_result { + Ok(entries) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", "200"]) + .observe(list_elapsed); + entries + } + Err(err) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", "404"]) + .observe(list_elapsed); + return Err(err.into()); + } + }; + let mut res = Vec::new(); + let mut files_scanned = 0; while let Some(entry) = entries.next_entry().await? 
{ let path = entry .path() @@ -261,22 +338,40 @@ impl ObjectStorage for LocalFS { .to_str() .expect("file name is parseable to str") .to_owned(); + + files_scanned += 1; let ingestor_file = filter_func(path); if !ingestor_file { continue; } - let file = fs::read(entry.path()).await?; - res.push(file.into()); + let file_read_start = Instant::now(); + let file_result = fs::read(entry.path()).await; + let file_read_elapsed = file_read_start.elapsed().as_secs_f64(); + + match file_result { + Ok(file) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "GET", "200"]) + .observe(file_read_elapsed); + res.push(file.into()); + } + Err(err) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "GET", "404"]) + .observe(file_read_elapsed); + return Err(err.into()); + } + } } + // Record total files scanned + STORAGE_FILES_SCANNED + .with_label_values(&["localfs", "GET"]) + .inc_by(files_scanned as f64); + // maybe change the return code - let status = if res.is_empty() { "200" } else { "400" }; - let time = time.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", status]) - .observe(time); Ok(res) } @@ -286,52 +381,182 @@ impl ObjectStorage for LocalFS { path: &RelativePath, resource: Bytes, ) -> Result<(), ObjectStorageError> { - let time = Instant::now(); - let path = self.path_in_root(path); if let Some(parent) = path.parent() { fs::create_dir_all(parent).await?; } - let res = fs::write(path, resource).await; - let status = if res.is_ok() { "200" } else { "400" }; - let time = time.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["PUT", status]) - .observe(time); + let put_start = Instant::now(); + let res = fs::write(path, resource).await; + let put_elapsed = put_start.elapsed().as_secs_f64(); + + match &res { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "PUT", "200"]) + .observe(put_elapsed); + // Record single file written successfully + STORAGE_FILES_SCANNED + .with_label_values(&["localfs", "PUT"]) + .inc(); + } + Err(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "PUT", "500"]) + .observe(put_elapsed); + } + } res.map_err(Into::into) } async fn delete_prefix(&self, path: &RelativePath) -> Result<(), ObjectStorageError> { let path = self.path_in_root(path); - tokio::fs::remove_dir_all(path).await?; + + let delete_start = Instant::now(); + let result = tokio::fs::remove_dir_all(path).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "DELETE", "200"]) + .observe(delete_elapsed); + } + Err(err) => { + let status_code = match err.kind() { + std::io::ErrorKind::NotFound => "404", + std::io::ErrorKind::PermissionDenied => "403", + _ => "500", + }; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "DELETE", status_code]) + .observe(delete_elapsed); + } + } + + result?; Ok(()) } async fn delete_object(&self, path: &RelativePath) -> Result<(), ObjectStorageError> { let path = self.path_in_root(path); - tokio::fs::remove_file(path).await?; + + let delete_start = Instant::now(); + let result = tokio::fs::remove_file(path).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "DELETE", "200"]) + .observe(delete_elapsed); + // Record single file deleted successfully + STORAGE_FILES_SCANNED + 
.with_label_values(&["localfs", "DELETE"]) + .inc(); + } + Err(err) => { + let status_code = match err.kind() { + std::io::ErrorKind::NotFound => "404", + std::io::ErrorKind::PermissionDenied => "403", + _ => "500", + }; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "DELETE", status_code]) + .observe(delete_elapsed); + } + } + + result?; Ok(()) } async fn check(&self) -> Result<(), ObjectStorageError> { - fs::create_dir_all(&self.root) - .await - .map_err(|e| ObjectStorageError::UnhandledError(e.into())) + let check_start = Instant::now(); + let result = fs::create_dir_all(&self.root).await; + let check_elapsed = check_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "HEAD", "200"]) + .observe(check_elapsed); + } + Err(err) => { + let status_code = match err.kind() { + std::io::ErrorKind::PermissionDenied => "403", + std::io::ErrorKind::NotFound => "404", + _ => "500", + }; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "HEAD", status_code]) + .observe(check_elapsed); + } + } + + result.map_err(|e| ObjectStorageError::UnhandledError(e.into())) } async fn delete_stream(&self, stream_name: &str) -> Result<(), ObjectStorageError> { let path = self.root.join(stream_name); - Ok(fs::remove_dir_all(path).await?) + + let delete_start = Instant::now(); + let result = fs::remove_dir_all(path).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "DELETE", "200"]) + .observe(delete_elapsed); + } + Err(err) => { + let status_code = match err.kind() { + std::io::ErrorKind::NotFound => "404", + std::io::ErrorKind::PermissionDenied => "403", + _ => "500", + }; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "DELETE", status_code]) + .observe(delete_elapsed); + } + } + + Ok(result?) } async fn try_delete_node_meta(&self, node_filename: String) -> Result<(), ObjectStorageError> { let path = self.root.join(node_filename); - Ok(fs::remove_file(path).await?) + + let delete_start = Instant::now(); + let result = fs::remove_file(path).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "DELETE", "200"]) + .observe(delete_elapsed); + } + Err(err) => { + let status_code = match err.kind() { + std::io::ErrorKind::NotFound => "404", + std::io::ErrorKind::PermissionDenied => "403", + _ => "500", + }; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "DELETE", status_code]) + .observe(delete_elapsed); + } + } + + Ok(result?) 
} async fn list_streams(&self) -> Result, ObjectStorageError> { + let list_start = Instant::now(); + let ignore_dir = &[ "lost+found", PARSEABLE_ROOT_DIRECTORY, @@ -339,7 +564,30 @@ impl ObjectStorage for LocalFS { ALERTS_ROOT_DIRECTORY, SETTINGS_ROOT_DIRECTORY, ]; - let directories = ReadDirStream::new(fs::read_dir(&self.root).await?); + + let result = fs::read_dir(&self.root).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let directories = match result { + Ok(read_dir) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", "200"]) + .observe(list_elapsed); + ReadDirStream::new(read_dir) + } + Err(err) => { + let status_code = match err.kind() { + std::io::ErrorKind::NotFound => "404", + std::io::ErrorKind::PermissionDenied => "403", + _ => "500", + }; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; + let entries: Vec = directories.try_collect().await?; let entries = entries .into_iter() @@ -354,13 +602,38 @@ impl ObjectStorage for LocalFS { } async fn list_old_streams(&self) -> Result, ObjectStorageError> { + let list_start = Instant::now(); + let ignore_dir = &[ "lost+found", PARSEABLE_ROOT_DIRECTORY, ALERTS_ROOT_DIRECTORY, SETTINGS_ROOT_DIRECTORY, ]; - let directories = ReadDirStream::new(fs::read_dir(&self.root).await?); + + let result = fs::read_dir(&self.root).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let directories = match result { + Ok(read_dir) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", "200"]) + .observe(list_elapsed); + ReadDirStream::new(read_dir) + } + Err(err) => { + let status_code = match err.kind() { + std::io::ErrorKind::NotFound => "404", + std::io::ErrorKind::PermissionDenied => "403", + _ => "500", + }; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; + let entries: Vec = directories.try_collect().await?; let entries = entries .into_iter() @@ -375,7 +648,31 @@ impl ObjectStorage for LocalFS { } async fn list_dirs(&self) -> Result, ObjectStorageError> { - let dirs = ReadDirStream::new(fs::read_dir(&self.root).await?) + let list_start = Instant::now(); + let result = fs::read_dir(&self.root).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let read_dir = match result { + Ok(read_dir) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", "200"]) + .observe(list_elapsed); + read_dir + } + Err(err) => { + let status_code = match err.kind() { + std::io::ErrorKind::NotFound => "404", + std::io::ErrorKind::PermissionDenied => "403", + _ => "500", + }; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; + + let dirs = ReadDirStream::new(read_dir) .try_collect::>() .await? .into_iter() @@ -396,7 +693,32 @@ impl ObjectStorage for LocalFS { relative_path: &RelativePath, ) -> Result, ObjectStorageError> { let root = self.root.join(relative_path.as_str()); - let dirs = ReadDirStream::new(fs::read_dir(root).await?) 
+ + let list_start = Instant::now(); + let result = fs::read_dir(root).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let read_dir = match result { + Ok(read_dir) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", "200"]) + .observe(list_elapsed); + read_dir + } + Err(err) => { + let status_code = match err.kind() { + std::io::ErrorKind::NotFound => "404", + std::io::ErrorKind::PermissionDenied => "403", + _ => "500", + }; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; + + let dirs = ReadDirStream::new(read_dir) .try_collect::>() .await? .into_iter() @@ -414,7 +736,32 @@ impl ObjectStorage for LocalFS { async fn list_dates(&self, stream_name: &str) -> Result, ObjectStorageError> { let path = self.root.join(stream_name); - let directories = ReadDirStream::new(fs::read_dir(&path).await?); + + let list_start = Instant::now(); + let result = fs::read_dir(&path).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let read_dir = match result { + Ok(read_dir) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", "200"]) + .observe(list_elapsed); + read_dir + } + Err(err) => { + let status_code = match err.kind() { + std::io::ErrorKind::NotFound => "404", + std::io::ErrorKind::PermissionDenied => "403", + _ => "500", + }; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; + + let directories = ReadDirStream::new(read_dir); let entries: Vec = directories.try_collect().await?; let entries = entries.into_iter().map(dir_name); let dates: Vec<_> = FuturesUnordered::from_iter(entries).try_collect().await?; @@ -468,6 +815,7 @@ impl ObjectStorage for LocalFS { } async fn upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { + let upload_start = Instant::now(); let op = CopyOptions { overwrite: true, skip_exist: true, @@ -477,8 +825,24 @@ impl ObjectStorage for LocalFS { if let Some(path) = to_path.parent() { fs::create_dir_all(path).await?; } - let _ = fs_extra::file::copy(path, to_path, &op)?; - Ok(()) + + let result = fs_extra::file::copy(path, to_path, &op); + let upload_elapsed = upload_start.elapsed().as_secs_f64(); + + match result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "PUT", "200"]) + .observe(upload_elapsed); + Ok(()) + } + Err(err) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["localfs", "PUT", "500"]) + .observe(upload_elapsed); + Err(err.into()) + } + } } fn absolute_url(&self, prefix: &RelativePath) -> object_store::path::Path { diff --git a/src/storage/metrics_layer.rs b/src/storage/metrics_layer.rs index cfaaeb6d2..6de1d9e64 100644 --- a/src/storage/metrics_layer.rs +++ b/src/storage/metrics_layer.rs @@ -34,16 +34,49 @@ use object_store::{ use object_store::{MultipartUpload, PutMultipartOpts, PutPayload} */ -use crate::metrics::storage::s3::QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME; +use crate::metrics::storage::STORAGE_REQUEST_RESPONSE_TIME; + +// Public helper function to map object_store errors to HTTP status codes +pub fn error_to_status_code(err: &object_store::Error) -> &'static str { + match err { + // 400 Bad Request - Client errors + object_store::Error::Generic { .. } => "400", + + // 401 Unauthorized - Authentication required + object_store::Error::Unauthenticated { .. 
} => "401", + + // 404 Not Found - Resource doesn't exist + object_store::Error::NotFound { .. } => "404", + + // 409 Conflict - Resource already exists + object_store::Error::AlreadyExists { .. } => "409", + + // 412 Precondition Failed - If-Match, If-None-Match, etc. failed + object_store::Error::Precondition { .. } => "412", + + // 304 Not Modified + object_store::Error::NotModified { .. } => "304", + + // 501 Not Implemented - Feature not supported + object_store::Error::NotSupported { .. } => "501", + + // 500 Internal Server Error - All other errors + _ => "500", + } +} #[derive(Debug)] pub struct MetricLayer { inner: T, + provider: String, } impl MetricLayer { - pub fn new(inner: T) -> Self { - Self { inner } + pub fn new(inner: T, provider: &str) -> Self { + Self { + inner, + provider: provider.to_string(), + } } } @@ -62,12 +95,18 @@ impl ObjectStore for MetricLayer { bytes: PutPayload, /* PutPayload */ ) -> ObjectStoreResult { let time = time::Instant::now(); - let put_result = self.inner.put(location, bytes).await?; + let put_result = self.inner.put(location, bytes).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["PUT", "200"]) + + let status = match &put_result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "PUT", status]) .observe(elapsed); - return Ok(put_result); + put_result } async fn put_opts( @@ -77,12 +116,18 @@ impl ObjectStore for MetricLayer { opts: PutOptions, ) -> ObjectStoreResult { let time = time::Instant::now(); - let put_result = self.inner.put_opts(location, payload, opts).await?; + let put_result = self.inner.put_opts(location, payload, opts).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["PUT_OPTS", "200"]) + + let status = match &put_result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "PUT_OPTS", status]) .observe(elapsed); - return Ok(put_result); + put_result } // // ! 
removed in object_store 0.10.0 @@ -94,7 +139,7 @@ impl ObjectStore for MetricLayer { // let time = time::Instant::now(); // let elapsed = time.elapsed().as_secs_f64(); // self.inner.abort_multipart(location, multipart_id).await?; - // QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME + // STORAGE_REQUEST_RESPONSE_TIME // .with_label_values(&["PUT_MULTIPART_ABORT", "200"]) // .observe(elapsed); // Ok(()) @@ -107,56 +152,84 @@ impl ObjectStore for MetricLayer { opts: PutMultipartOpts, ) -> ObjectStoreResult> { let time = time::Instant::now(); - let multipart_upload = self.inner.put_multipart_opts(location, opts).await?; + let result = self.inner.put_multipart_opts(location, opts).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["PUT_MULTIPART_OPTS", "200"]) - .observe(elapsed); - Ok(multipart_upload) + let status = match &result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "PUT_MULTIPART_OPTS", status]) + .observe(elapsed); + result } // todo completly tracking multipart upload async fn put_multipart(&self, location: &Path) -> ObjectStoreResult> /* ObjectStoreResult<(MultipartId, Box)> */ { let time = time::Instant::now(); - let multipart_upload = self.inner.put_multipart(location).await?; + let result = self.inner.put_multipart(location).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["PUT_MULTIPART", "200"]) - .observe(elapsed); - Ok(multipart_upload) + let status = match &result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "PUT_MULTIPART", status]) + .observe(elapsed); + result } async fn get(&self, location: &Path) -> ObjectStoreResult { let time = time::Instant::now(); - let res = self.inner.get(location).await?; + let get_result = self.inner.get(location).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) + + let status = match &get_result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "GET", status]) .observe(elapsed); - Ok(res) + get_result } async fn get_opts(&self, location: &Path, options: GetOptions) -> ObjectStoreResult { let time = time::Instant::now(); - let res = self.inner.get_opts(location, options).await?; + let result = self.inner.get_opts(location, options).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["GET_OPTS", "200"]) + + let status = match &result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "GET_OPTS", status]) .observe(elapsed); - Ok(res) + result } async fn get_range(&self, location: &Path, range: Range) -> ObjectStoreResult { let time = time::Instant::now(); - let res = self.inner.get_range(location, range).await?; + let result = self.inner.get_range(location, range).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["GET_RANGE", "200"]) + + let status = match &result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "GET_RANGE", status]) .observe(elapsed); - 
Ok(res) + result } async fn get_ranges( @@ -165,32 +238,50 @@ impl ObjectStore for MetricLayer { ranges: &[Range], ) -> ObjectStoreResult> { let time = time::Instant::now(); - let res = self.inner.get_ranges(location, ranges).await?; + let result = self.inner.get_ranges(location, ranges).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["GET_RANGES", "200"]) + + let status = match &result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "GET_RANGES", status]) .observe(elapsed); - Ok(res) + result } async fn head(&self, location: &Path) -> ObjectStoreResult { let time = time::Instant::now(); - let res = self.inner.head(location).await?; + let result = self.inner.head(location).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["HEAD", "200"]) + + let status = match &result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "HEAD", status]) .observe(elapsed); - Ok(res) + result } async fn delete(&self, location: &Path) -> ObjectStoreResult<()> { let time = time::Instant::now(); - let res = self.inner.delete(location).await?; + let result = self.inner.delete(location).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["DELETE", "200"]) + + let status = match &result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "DELETE", status]) .observe(elapsed); - Ok(res) + result } fn delete_stream<'a>( @@ -229,52 +320,82 @@ impl ObjectStore for MetricLayer { async fn list_with_delimiter(&self, prefix: Option<&Path>) -> ObjectStoreResult { let time = time::Instant::now(); - let res = self.inner.list_with_delimiter(prefix).await?; + let result = self.inner.list_with_delimiter(prefix).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["LIST_DELIM", "200"]) + + let status = match &result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "LIST_DELIM", status]) .observe(elapsed); - Ok(res) + result } async fn copy(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { let time = time::Instant::now(); - let res = self.inner.copy(from, to).await?; + let result = self.inner.copy(from, to).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["COPY", "200"]) + + let status = match &result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "COPY", status]) .observe(elapsed); - Ok(res) + result } async fn rename(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { let time = time::Instant::now(); - let res = self.inner.rename(from, to).await?; + let result = self.inner.rename(from, to).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["RENAME", "200"]) + + let status = match &result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "RENAME", status]) .observe(elapsed); - Ok(res) + result 
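// A minimal usage sketch, not part of the patch: with the provider label added to
// MetricLayer::new, call sites wrap a backend as shown below. The in-memory store,
// the "memory" label value, the object path, and the module path for MetricLayer
// are placeholders assumed for illustration.
use object_store::{ObjectStore, PutPayload, memory::InMemory, path::Path};

use crate::storage::metrics_layer::MetricLayer;

async fn metric_layer_demo() -> object_store::Result<()> {
    // Every sample recorded through this store carries provider="memory".
    let store = MetricLayer::new(InMemory::new(), "memory");

    store
        .put(&Path::from("demo/object"), PutPayload::from(b"hello".to_vec()))
        .await?;
    let _bytes = store.get(&Path::from("demo/object")).await?.bytes().await?;
    store.delete(&Path::from("demo/object")).await?;
    Ok(())
}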
} async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { let time = time::Instant::now(); - let res = self.inner.copy_if_not_exists(from, to).await?; + let result = self.inner.copy_if_not_exists(from, to).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["COPY_IF", "200"]) + + let status = match &result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "COPY_IF", status]) .observe(elapsed); - Ok(res) + result } async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> ObjectStoreResult<()> { let time = time::Instant::now(); - let res = self.inner.rename_if_not_exists(from, to).await?; + let result = self.inner.rename_if_not_exists(from, to).await; let elapsed = time.elapsed().as_secs_f64(); - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["RENAME_IF", "200"]) + + let status = match &result { + Ok(_) => "200", + Err(err) => error_to_status_code(err), + }; + + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&[&self.provider, "RENAME_IF", status]) .observe(elapsed); - Ok(res) + result } } @@ -293,7 +414,7 @@ impl Stream for StreamMetricWrapper<'_, N, T> { ) -> Poll> { match self.inner.poll_next_unpin(cx) { t @ Poll::Ready(None) => { - QUERY_LAYER_STORAGE_REQUEST_RESPONSE_TIME + STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&self.labels) .observe(self.time.elapsed().as_secs_f64()); t diff --git a/src/storage/s3.rs b/src/storage/s3.rs index 824ab021a..9e28c46b1 100644 --- a/src/storage/s3.rs +++ b/src/storage/s3.rs @@ -21,7 +21,10 @@ use std::{ fmt::Display, path::Path, str::FromStr, - sync::Arc, + sync::{ + Arc, + atomic::{AtomicU64, Ordering}, + }, time::{Duration, Instant}, }; @@ -48,7 +51,7 @@ use tracing::{error, info}; use crate::{ handlers::http::users::USERS_ROOT_DIR, - metrics::storage::{StorageMetrics, azureblob::REQUEST_RESPONSE_TIME}, + metrics::storage::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}, parseable::LogStream, }; @@ -56,7 +59,7 @@ use super::{ CONNECT_TIMEOUT_SECS, MIN_MULTIPART_UPLOAD_SIZE, ObjectStorage, ObjectStorageError, ObjectStorageProvider, PARSEABLE_ROOT_DIRECTORY, REQUEST_TIMEOUT_SECS, SCHEMA_FILE_NAME, STREAM_METADATA_FILE_NAME, STREAM_ROOT_DIRECTORY, metrics_layer::MetricLayer, - object_storage::parseable_json_path, to_object_store_path, + metrics_layer::error_to_status_code, object_storage::parseable_json_path, to_object_store_path, }; // in bytes @@ -300,7 +303,7 @@ impl ObjectStorageProvider for S3Config { // limit objectstore to a concurrent request limit let s3 = LimitStore::new(s3, super::MAX_OBJECT_STORE_REQUESTS); - let s3 = MetricLayer::new(s3); + let s3 = MetricLayer::new(s3, "s3"); let object_store_registry = DefaultObjectStoreRegistry::new(); let url = ObjectStoreUrl::parse(format!("s3://{}", &self.bucket_name)).unwrap(); @@ -337,24 +340,24 @@ pub struct S3 { impl S3 { async fn _get_object(&self, path: &RelativePath) -> Result { - let instant = Instant::now(); - + let time = std::time::Instant::now(); let resp = self.client.get(&to_object_store_path(path)).await; + let elapsed = time.elapsed().as_secs_f64(); + match resp { Ok(resp) => { - let time = instant.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) - .observe(time); let body = resp.bytes().await.unwrap(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "GET", "200"]) + .observe(elapsed); Ok(body) } 
Err(err) => { - let time = instant.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "400"]) - .observe(time); + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "GET", status_code]) + .observe(elapsed); Err(err.into()) } } @@ -365,50 +368,106 @@ impl S3 { path: &RelativePath, resource: PutPayload, ) -> Result<(), ObjectStorageError> { - let time = Instant::now(); + let time = std::time::Instant::now(); let resp = self.client.put(&to_object_store_path(path), resource).await; - let status = if resp.is_ok() { "200" } else { "400" }; - let time = time.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["PUT", status]) - .observe(time); - - if let Err(object_store::Error::NotFound { source, .. }) = &resp { - let source_str = source.to_string(); - if source_str.contains("NoSuchBucket") { - return Err(ObjectStorageError::Custom( - format!("Bucket '{}' does not exist in S3.", self.bucket).to_string(), - )); + + let elapsed = time.elapsed().as_secs_f64(); + + match resp { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT", "200"]) + .observe(elapsed); + Ok(()) + } + Err(err) => { + let status_code = match &err { + object_store::Error::NotFound { .. } => { + // Check for specific S3 bucket not found error + let source_str = err.to_string(); + if source_str.contains("NoSuchBucket") { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT", "404"]) + .observe(elapsed); + return Err(ObjectStorageError::Custom( + format!("Bucket '{}' does not exist in S3.", self.bucket) + .to_string(), + )); + } + "404" + } + _ => error_to_status_code(&err), + }; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT", status_code]) + .observe(elapsed); + Err(err.into()) } } - - resp.map(|_| ()).map_err(|err| err.into()) } async fn _delete_prefix(&self, key: &str) -> Result<(), ObjectStorageError> { + let files_scanned = Arc::new(AtomicU64::new(0)); + + // Track LIST operation + let list_start = Instant::now(); let object_stream = self.client.list(Some(&(key.into()))); + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(list_elapsed); + let files_scanned_clone = files_scanned.clone(); object_stream .for_each_concurrent(None, |x| async { match x { Ok(obj) => { - if (self.client.delete(&obj.location).await).is_err() { - error!("Failed to fetch object during delete stream"); + files_scanned_clone.fetch_add(1, Ordering::Relaxed); + // Track individual DELETE operation + let delete_start = Instant::now(); + match self.client.delete(&obj.location).await { + Ok(_) => { + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "DELETE", "200"]) + .observe(delete_elapsed); + } + Err(err) => { + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "DELETE", status_code]) + .observe(delete_elapsed); + error!("Failed to delete object during delete stream: {:?}", err); + } } } - Err(_) => { - error!("Failed to fetch object during delete stream"); + Err(err) => { + error!("Failed to fetch object during delete stream: {:?}", err); } }; }) .await; + // Record total files scanned + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "LIST"]) + .inc_by(files_scanned.load(Ordering::Relaxed) as f64); + Ok(()) } async 
fn _list_streams(&self) -> Result, ObjectStorageError> { let mut result_file_list = HashSet::new(); + let mut total_files_scanned = 0u64; + + // Track initial LIST operation + let list_start = Instant::now(); let resp = self.client.list_with_delimiter(None).await?; + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(list_elapsed); let streams = resp .common_prefixes @@ -421,27 +480,79 @@ impl S3 { for stream in streams { let stream_path = object_store::path::Path::from(format!("{}/{}", &stream, STREAM_ROOT_DIRECTORY)); - let resp = self.client.list_with_delimiter(Some(&stream_path)).await?; - if resp - .objects - .iter() - .any(|name| name.location.filename().unwrap().ends_with("stream.json")) - { - result_file_list.insert(stream); + + // Track individual LIST operations for each stream + let stream_list_start = Instant::now(); + let resp = self.client.list_with_delimiter(Some(&stream_path)).await; + let stream_list_elapsed = stream_list_start.elapsed().as_secs_f64(); + + match &resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(stream_list_elapsed); + + total_files_scanned += resp.objects.len() as u64; + if resp + .objects + .iter() + .any(|name| name.location.filename().unwrap().ends_with("stream.json")) + { + result_file_list.insert(stream); + } + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", status_code]) + .observe(stream_list_elapsed); + return Err(ObjectStorageError::UnhandledError(Box::new( + std::io::Error::other(format!("List operation failed: {}", err)), + ))); + } } } + // Record total files scanned across all operations + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "LIST"]) + .inc_by(total_files_scanned as f64); + Ok(result_file_list) } async fn _list_dates(&self, stream: &str) -> Result, ObjectStorageError> { + // Track LIST operation + let list_start = Instant::now(); let resp = self .client .list_with_delimiter(Some(&(stream.into()))) - .await?; + .await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let resp = match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(list_elapsed); + resp + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; let common_prefixes = resp.common_prefixes; + // Record files scanned (prefixes/directories count as files scanned) + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "LIST"]) + .inc_by(common_prefixes.len() as f64); + // return prefixes at the root level let dates: Vec<_> = common_prefixes .iter() @@ -457,10 +568,31 @@ impl S3 { stream: &str, ) -> Result>, ObjectStorageError> { let mut result_file_list: BTreeMap> = BTreeMap::new(); + let mut total_files_scanned = 0u64; + + // Track initial LIST operation + let list_start = Instant::now(); let resp = self .client .list_with_delimiter(Some(&(stream.into()))) - .await?; + .await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let resp = match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(list_elapsed); + resp + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", 
"LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; let dates = resp .common_prefixes @@ -469,45 +601,81 @@ impl S3 { .filter(|name| name.as_ref() != stream && name.as_ref() != STREAM_ROOT_DIRECTORY) .map(|name| name.as_ref().to_string()) .collect::>(); + for date in dates { let date_path = object_store::path::Path::from(format!("{}/{}", stream, &date)); - let resp = self.client.list_with_delimiter(Some(&date_path)).await?; - let manifests: Vec = resp - .objects - .iter() - .filter(|name| name.location.filename().unwrap().ends_with("manifest.json")) - .map(|name| name.location.to_string()) - .collect(); - result_file_list.entry(date).or_default().extend(manifests); + + // Track individual LIST operation for each date + let date_list_start = Instant::now(); + let resp = self.client.list_with_delimiter(Some(&date_path)).await; + let date_list_elapsed = date_list_start.elapsed().as_secs_f64(); + + match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(date_list_elapsed); + + total_files_scanned += resp.objects.len() as u64; + let manifests: Vec = resp + .objects + .iter() + .filter(|name| name.location.filename().unwrap().ends_with("manifest.json")) + .map(|name| name.location.to_string()) + .collect(); + result_file_list.entry(date).or_default().extend(manifests); + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", status_code]) + .observe(date_list_elapsed); + return Err(err.into()); + } + } } + + // Record total files scanned across all date operations + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "LIST"]) + .inc_by(total_files_scanned as f64); + Ok(result_file_list) } async fn _upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { - let instant = Instant::now(); - // // TODO: Uncomment this when multipart is fixed // let should_multipart = std::fs::metadata(path)?.len() > MULTIPART_UPLOAD_SIZE as u64; let should_multipart = false; - let res = if should_multipart { + if should_multipart { // self._upload_multipart(key, path).await // this branch will never get executed Ok(()) } else { let bytes = tokio::fs::read(path).await?; - let result = self.client.put(&key.into(), bytes.into()).await?; - info!("Uploaded file to S3: {:?}", result); - Ok(()) - }; - - let status = if res.is_ok() { "200" } else { "400" }; - let time = instant.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["UPLOAD_PARQUET", status]) - .observe(time); - res + let put_start = Instant::now(); + let result = self.client.put(&key.into(), bytes.into()).await; + let put_elapsed = put_start.elapsed().as_secs_f64(); + + match result { + Ok(result) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT", "200"]) + .observe(put_elapsed); + info!("Uploaded file to S3: {:?}", result); + Ok(()) + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT", status_code]) + .observe(put_elapsed); + Err(err.into()) + } + } + } } async fn _upload_multipart( @@ -518,14 +686,53 @@ impl S3 { let mut file = OpenOptions::new().read(true).open(path).await?; let location = &to_object_store_path(key); - let mut async_writer = self.client.put_multipart(location).await?; + // Track multipart initiation + let multipart_start = Instant::now(); + let async_writer = self.client.put_multipart(location).await; + let multipart_elapsed = 
multipart_start.elapsed().as_secs_f64(); + + let mut async_writer = match async_writer { + Ok(writer) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT_MULTIPART_INIT", "200"]) + .observe(multipart_elapsed); + writer + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT_MULTIPART_INIT", status_code]) + .observe(multipart_elapsed); + return Err(err.into()); + } + }; let meta = file.metadata().await?; let total_size = meta.len() as usize; if total_size < MIN_MULTIPART_UPLOAD_SIZE { let mut data = Vec::new(); file.read_to_end(&mut data).await?; - self.client.put(location, data.into()).await?; + + // Track single PUT operation for small files + let put_start = Instant::now(); + let result = self.client.put(location, data.into()).await; + let put_elapsed = put_start.elapsed().as_secs_f64(); + + match result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT", "200"]) + .observe(put_elapsed); + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT", status_code]) + .observe(put_elapsed); + return Err(err.into()); + } + } + // async_writer.put_part(data.into()).await?; // async_writer.complete().await?; return Ok(()); @@ -539,7 +746,7 @@ impl S3 { let num_full_parts = total_size / MIN_MULTIPART_UPLOAD_SIZE; let total_parts = num_full_parts + if has_final_partial_part { 1 } else { 0 }; - // Upload each part + // Upload each part with metrics for part_number in 0..(total_parts) { let start_pos = part_number * MIN_MULTIPART_UPLOAD_SIZE; let end_pos = if part_number == num_full_parts && has_final_partial_part { @@ -553,15 +760,47 @@ impl S3 { // Extract this part's data let part_data = data[start_pos..end_pos].to_vec(); - // Upload the part - async_writer.put_part(part_data.into()).await?; + // Track individual part upload + let part_start = Instant::now(); + let result = async_writer.put_part(part_data.into()).await; + let part_elapsed = part_start.elapsed().as_secs_f64(); + + match result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT_MULTIPART_PART", "200"]) + .observe(part_elapsed); + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT_MULTIPART_PART", status_code]) + .observe(part_elapsed); + return Err(err.into()); + } + } // upload_parts.push(part_number as u64 + 1); } - if let Err(err) = async_writer.complete().await { + + // Track multipart completion + let complete_start = Instant::now(); + let complete_result = async_writer.complete().await; + let complete_elapsed = complete_start.elapsed().as_secs_f64(); + + if let Err(err) = complete_result { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT_MULTIPART_COMPLETE", status_code]) + .observe(complete_elapsed); error!("Failed to complete multipart upload. 
{:?}", err); async_writer.abort().await?; - }; + return Err(err.into()); + } else { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT_MULTIPART_COMPLETE", "200"]) + .observe(complete_elapsed); + } } Ok(()) } @@ -574,7 +813,26 @@ impl ObjectStorage for S3 { path: &RelativePath, ) -> Result { let path = &to_object_store_path(path); - let meta = self.client.head(path).await?; + + let head_start = Instant::now(); + let meta = self.client.head(path).await; + let head_elapsed = head_start.elapsed().as_secs_f64(); + + let meta = match meta { + Ok(meta) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "HEAD", "200"]) + .observe(head_elapsed); + meta + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "HEAD", status_code]) + .observe(head_elapsed); + return Err(err.into()); + } + }; let store: Arc = Arc::new(self.client.clone()); let buf = object_store::buffered::BufReader::new(store, &meta); @@ -588,11 +846,40 @@ impl ObjectStorage for S3 { self._upload_multipart(key, path).await } async fn head(&self, path: &RelativePath) -> Result { - Ok(self.client.head(&to_object_store_path(path)).await?) + let head_start = Instant::now(); + let result = self.client.head(&to_object_store_path(path)).await; + let head_elapsed = head_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "HEAD", "200"]) + .observe(head_elapsed); + // Record single file accessed + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "HEAD"]) + .inc(); + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "HEAD", status_code]) + .observe(head_elapsed); + } + } + + Ok(result?) } async fn get_object(&self, path: &RelativePath) -> Result { - Ok(self._get_object(path).await?) + let result = self._get_object(path).await?; + + // Record single file accessed + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "GET"]) + .inc(); + + Ok(result) } async fn get_objects( @@ -600,19 +887,40 @@ impl ObjectStorage for S3 { base_path: Option<&RelativePath>, filter_func: Box bool + Send>, ) -> Result, ObjectStorageError> { - let instant = Instant::now(); - let prefix = if let Some(base_path) = base_path { to_object_store_path(base_path) } else { self.root.clone() }; + // Track list operation + let list_start = Instant::now(); let mut list_stream = self.client.list(Some(&prefix)); let mut res = vec![]; + let mut files_scanned = 0; + + // Note: We track each streaming list item retrieval + while let Some(meta_result) = list_stream.next().await { + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let meta = match meta_result { + Ok(meta) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(list_elapsed); + meta + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; - while let Some(meta) = list_stream.next().await.transpose()? 
{ + files_scanned += 1; let ingestor_file = filter_func(meta.location.filename().unwrap().to_string()); if !ingestor_file { @@ -629,10 +937,10 @@ impl ObjectStorage for S3 { res.push(byts); } - let instant = instant.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) - .observe(instant); + // Record total files scanned + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "GET"]) + .inc_by(files_scanned as f64); Ok(res) } @@ -640,11 +948,33 @@ impl ObjectStorage for S3 { async fn get_ingestor_meta_file_paths( &self, ) -> Result, ObjectStorageError> { - let time = Instant::now(); let mut path_arr = vec![]; + let mut files_scanned = 0; + + // Track list operation + let list_start = Instant::now(); let mut object_stream = self.client.list(Some(&self.root)); - while let Some(meta) = object_stream.next().await.transpose()? { + while let Some(meta_result) = object_stream.next().await { + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let meta = match meta_result { + Ok(meta) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(list_elapsed); + meta + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; + + files_scanned += 1; let flag = meta.location.filename().unwrap().starts_with("ingestor"); if flag { @@ -652,10 +982,10 @@ impl ObjectStorage for S3 { } } - let time = time.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) - .observe(time); + // Record total files scanned + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "LIST"]) + .inc_by(files_scanned as f64); Ok(path_arr) } @@ -664,12 +994,34 @@ impl ObjectStorage for S3 { &self, stream_name: &str, ) -> Result, ObjectStorageError> { - let time = Instant::now(); let mut path_arr = vec![]; + let mut files_scanned = 0; let path = to_object_store_path(&RelativePathBuf::from(stream_name)); + + // Track list operation + let list_start = Instant::now(); let mut object_stream = self.client.list(Some(&path)); - while let Some(meta) = object_stream.next().await.transpose()? 
{ + while let Some(meta_result) = object_stream.next().await { + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let meta = match meta_result { + Ok(meta) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(list_elapsed); + meta + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; + + files_scanned += 1; let flag = meta.location.filename().unwrap().starts_with(".ingestor"); if flag { @@ -683,10 +1035,10 @@ impl ObjectStorage for S3 { ])); path_arr.push(RelativePathBuf::from_iter([stream_name, SCHEMA_FILE_NAME])); - let time = time.elapsed().as_secs_f64(); - REQUEST_RESPONSE_TIME - .with_label_values(&["GET", "200"]) - .observe(time); + // Record total files scanned + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "LIST"]) + .inc_by(files_scanned as f64); Ok(path_arr) } @@ -700,6 +1052,11 @@ impl ObjectStorage for S3 { .await .map_err(|err| ObjectStorageError::ConnectionError(Box::new(err)))?; + // Record single file written + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "PUT"]) + .inc(); + Ok(()) } @@ -710,15 +1067,54 @@ impl ObjectStorage for S3 { } async fn delete_object(&self, path: &RelativePath) -> Result<(), ObjectStorageError> { - Ok(self.client.delete(&to_object_store_path(path)).await?) + let delete_start = Instant::now(); + let result = self.client.delete(&to_object_store_path(path)).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "DELETE", "200"]) + .observe(delete_elapsed); + // Record single file deleted + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "DELETE"]) + .inc(); + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "DELETE", status_code]) + .observe(delete_elapsed); + } + } + + Ok(result?) } async fn check(&self) -> Result<(), ObjectStorageError> { - Ok(self + let head_start = Instant::now(); + let result = self .client .head(&to_object_store_path(&parseable_json_path())) - .await - .map(|_| ())?) + .await; + let head_elapsed = head_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "HEAD", "200"]) + .observe(head_elapsed); + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "HEAD", status_code]) + .observe(head_elapsed); + } + } + + Ok(result.map(|_| ())?) 
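// An illustrative check, not part of the patch, of the error-to-status mapping that
// feeds the `status` label recorded above. The NotFound field values are made up for
// the test; unrecognised errors fall back to "500" per the mapping.
#[cfg(test)]
mod status_label_sketch {
    use crate::storage::metrics_layer::error_to_status_code;

    #[test]
    fn not_found_maps_to_404() {
        let err = object_store::Error::NotFound {
            path: "missing/object".to_string(),
            source: "no such key".into(),
        };
        assert_eq!(error_to_status_code(&err), "404");
    }
}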
} async fn delete_stream(&self, stream_name: &str) -> Result<(), ObjectStorageError> { @@ -729,9 +1125,24 @@ impl ObjectStorage for S3 { async fn try_delete_node_meta(&self, node_filename: String) -> Result<(), ObjectStorageError> { let file = RelativePathBuf::from(&node_filename); - match self.client.delete(&to_object_store_path(&file)).await { - Ok(_) => Ok(()), + + let delete_start = Instant::now(); + let result = self.client.delete(&to_object_store_path(&file)).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + + match result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "DELETE", "200"]) + .observe(delete_elapsed); + Ok(()) + } Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "DELETE", status_code]) + .observe(delete_elapsed); + // if the object is not found, it is not an error // the given url path was incorrect if matches!(err, object_store::Error::NotFound { .. }) { @@ -750,7 +1161,13 @@ impl ObjectStorage for S3 { } async fn list_old_streams(&self) -> Result, ObjectStorageError> { + // Track LIST operation + let list_start = Instant::now(); let resp = self.client.list_with_delimiter(None).await?; + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(list_elapsed); let common_prefixes = resp.common_prefixes; // get all dirs @@ -766,7 +1183,27 @@ impl ObjectStorage for S3 { for dir in &dirs { let key = format!("{dir}/{STREAM_METADATA_FILE_NAME}"); - let task = async move { self.client.head(&StorePath::from(key)).await.map(|_| ()) }; + let task = async move { + let head_start = Instant::now(); + let result = self.client.head(&StorePath::from(key)).await; + let head_elapsed = head_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "HEAD", "200"]) + .observe(head_elapsed); + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "HEAD", status_code]) + .observe(head_elapsed); + } + } + + result.map(|_| ()) + }; stream_json_check.push(task); } @@ -874,7 +1311,26 @@ impl ObjectStorage for S3 { async fn list_dirs(&self) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from("/"); - let resp = self.client.list_with_delimiter(Some(&pre)).await?; + + let list_start = Instant::now(); + let resp = self.client.list_with_delimiter(Some(&pre)).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let resp = match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(list_elapsed); + resp + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; Ok(resp .common_prefixes @@ -889,7 +1345,26 @@ impl ObjectStorage for S3 { relative_path: &RelativePath, ) -> Result, ObjectStorageError> { let prefix = object_store::path::Path::from(relative_path.as_str()); - let resp = self.client.list_with_delimiter(Some(&prefix)).await?; + + let list_start = Instant::now(); + let resp = self.client.list_with_delimiter(Some(&prefix)).await; + let list_elapsed = list_start.elapsed().as_secs_f64(); + + let resp = match resp { + Ok(resp) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + 
.observe(list_elapsed); + resp + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", status_code]) + .observe(list_elapsed); + return Err(err.into()); + } + }; Ok(resp .common_prefixes From 80f5d963c46a4db5cbfcc924dc51e2520460b8f2 Mon Sep 17 00:00:00 2001 From: Nikhil Sinha Date: Sun, 24 Aug 2025 10:19:51 -0700 Subject: [PATCH 2/3] improve metrics collection --- src/catalog/mod.rs | 6 +- src/metadata.rs | 17 +- src/metrics/mod.rs | 95 +++++++-- src/query/listing_table_builder.rs | 80 ++----- src/query/mod.rs | 9 +- src/query/stream_schema_provider.rs | 77 ++++--- src/stats.rs | 11 +- src/storage/azure_blob.rs | 310 ++++++++++++++-------------- src/storage/gcs.rs | 280 ++++++++++++++----------- src/storage/object_storage.rs | 12 +- src/storage/s3.rs | 230 +++++++++------------ 11 files changed, 569 insertions(+), 558 deletions(-) diff --git a/src/catalog/mod.rs b/src/catalog/mod.rs index 750864077..15bbda2a7 100644 --- a/src/catalog/mod.rs +++ b/src/catalog/mod.rs @@ -189,17 +189,17 @@ fn extract_partition_metrics(stream_name: &str, partition_lower: DateTime) let events_ingested = EVENTS_INGESTED_DATE .get_metric_with_label_values(&event_labels) - .map(|metric| metric.get() as u64) + .map(|metric| metric.get()) .unwrap_or(0); let ingestion_size = EVENTS_INGESTED_SIZE_DATE .get_metric_with_label_values(&event_labels) - .map(|metric| metric.get() as u64) + .map(|metric| metric.get()) .unwrap_or(0); let storage_size = EVENTS_STORAGE_SIZE_DATE .get_metric_with_label_values(&storage_labels) - .map(|metric| metric.get() as u64) + .map(|metric| metric.get()) .unwrap_or(0); (events_ingested, ingestion_size, storage_size) diff --git a/src/metadata.rs b/src/metadata.rs index 1e7061bfb..34b5880b4 100644 --- a/src/metadata.rs +++ b/src/metadata.rs @@ -29,6 +29,7 @@ use crate::handlers::TelemetryType; use crate::metrics::{ EVENTS_INGESTED, EVENTS_INGESTED_DATE, EVENTS_INGESTED_SIZE, EVENTS_INGESTED_SIZE_DATE, EVENTS_STORAGE_SIZE_DATE, LIFETIME_EVENTS_INGESTED, LIFETIME_EVENTS_INGESTED_SIZE, + TOTAL_EVENTS_INGESTED_DATE, TOTAL_EVENTS_INGESTED_SIZE_DATE, }; use crate::storage::StreamType; use crate::storage::retention::Retention; @@ -46,19 +47,25 @@ pub fn update_stats( .add(num_rows as i64); EVENTS_INGESTED_DATE .with_label_values(&[stream_name, origin, &parsed_date]) - .add(num_rows as i64); + .inc_by(num_rows as u64); EVENTS_INGESTED_SIZE .with_label_values(&[stream_name, origin]) .add(size as i64); EVENTS_INGESTED_SIZE_DATE .with_label_values(&[stream_name, origin, &parsed_date]) - .add(size as i64); + .inc_by(size); LIFETIME_EVENTS_INGESTED .with_label_values(&[stream_name, origin]) .add(num_rows as i64); LIFETIME_EVENTS_INGESTED_SIZE .with_label_values(&[stream_name, origin]) .add(size as i64); + TOTAL_EVENTS_INGESTED_DATE + .with_label_values(&[origin, &parsed_date]) + .inc_by(num_rows as u64); + TOTAL_EVENTS_INGESTED_SIZE_DATE + .with_label_values(&[origin, &parsed_date]) + .inc_by(size); } /// In order to support backward compatability with streams created before v1.6.4, @@ -173,12 +180,12 @@ pub fn load_daily_metrics(manifests: &Vec, stream_name: &str) { let storage_size = manifest.storage_size; EVENTS_INGESTED_DATE .with_label_values(&[stream_name, "json", &manifest_date]) - .set(events_ingested as i64); + .inc_by(events_ingested); EVENTS_INGESTED_SIZE_DATE .with_label_values(&[stream_name, "json", &manifest_date]) - .set(ingestion_size as i64); + .inc_by(ingestion_size); EVENTS_STORAGE_SIZE_DATE 
.with_label_values(&["data", stream_name, "parquet", &manifest_date]) - .set(storage_size as i64); + .inc_by(storage_size); } } diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs index 1896bce0c..81809eefd 100644 --- a/src/metrics/mod.rs +++ b/src/metrics/mod.rs @@ -30,7 +30,7 @@ pub const METRICS_NAMESPACE: &str = env!("CARGO_PKG_NAME"); pub static EVENTS_INGESTED: Lazy = Lazy::new(|| { IntGaugeVec::new( - Opts::new("events_ingested", "Events ingested").namespace(METRICS_NAMESPACE), + Opts::new("events_ingested", "Events ingested for a stream").namespace(METRICS_NAMESPACE), &["stream", "format"], ) .expect("metric can be created") @@ -38,8 +38,11 @@ pub static EVENTS_INGESTED: Lazy = Lazy::new(|| { pub static EVENTS_INGESTED_SIZE: Lazy = Lazy::new(|| { IntGaugeVec::new( - Opts::new("events_ingested_size", "Events ingested size bytes") - .namespace(METRICS_NAMESPACE), + Opts::new( + "events_ingested_size", + "Events ingested size bytes for a stream", + ) + .namespace(METRICS_NAMESPACE), &["stream", "format"], ) .expect("metric can be created") @@ -47,7 +50,7 @@ pub static EVENTS_INGESTED_SIZE: Lazy = Lazy::new(|| { pub static STORAGE_SIZE: Lazy = Lazy::new(|| { IntGaugeVec::new( - Opts::new("storage_size", "Storage size bytes").namespace(METRICS_NAMESPACE), + Opts::new("storage_size", "Storage size bytes for a stream").namespace(METRICS_NAMESPACE), &["type", "stream", "format"], ) .expect("metric can be created") @@ -55,7 +58,7 @@ pub static STORAGE_SIZE: Lazy = Lazy::new(|| { pub static EVENTS_DELETED: Lazy = Lazy::new(|| { IntGaugeVec::new( - Opts::new("events_deleted", "Events deleted").namespace(METRICS_NAMESPACE), + Opts::new("events_deleted", "Events deleted for a stream").namespace(METRICS_NAMESPACE), &["stream", "format"], ) .expect("metric can be created") @@ -63,7 +66,11 @@ pub static EVENTS_DELETED: Lazy = Lazy::new(|| { pub static EVENTS_DELETED_SIZE: Lazy = Lazy::new(|| { IntGaugeVec::new( - Opts::new("events_deleted_size", "Events deleted size bytes").namespace(METRICS_NAMESPACE), + Opts::new( + "events_deleted_size", + "Events deleted size bytes for a stream", + ) + .namespace(METRICS_NAMESPACE), &["stream", "format"], ) .expect("metric can be created") @@ -73,7 +80,7 @@ pub static DELETED_EVENTS_STORAGE_SIZE: Lazy = Lazy::new(|| { IntGaugeVec::new( Opts::new( "deleted_events_storage_size", - "Deleted events storage size bytes", + "Deleted events storage size bytes for a stream", ) .namespace(METRICS_NAMESPACE), &["type", "stream", "format"], @@ -83,8 +90,11 @@ pub static DELETED_EVENTS_STORAGE_SIZE: Lazy = Lazy::new(|| { pub static LIFETIME_EVENTS_INGESTED: Lazy = Lazy::new(|| { IntGaugeVec::new( - Opts::new("lifetime_events_ingested", "Lifetime events ingested") - .namespace(METRICS_NAMESPACE), + Opts::new( + "lifetime_events_ingested", + "Lifetime events ingested for a stream", + ) + .namespace(METRICS_NAMESPACE), &["stream", "format"], ) .expect("metric can be created") @@ -94,7 +104,7 @@ pub static LIFETIME_EVENTS_INGESTED_SIZE: Lazy = Lazy::new(|| { IntGaugeVec::new( Opts::new( "lifetime_events_ingested_size", - "Lifetime events ingested size bytes", + "Lifetime events ingested size bytes for a stream", ) .namespace(METRICS_NAMESPACE), &["stream", "format"], @@ -106,7 +116,7 @@ pub static LIFETIME_EVENTS_STORAGE_SIZE: Lazy = Lazy::new(|| { IntGaugeVec::new( Opts::new( "lifetime_events_storage_size", - "Lifetime events storage size bytes", + "Lifetime events storage size bytes for a stream", ) .namespace(METRICS_NAMESPACE), &["type", "stream", "format"], @@ -114,11 
+124,11 @@ pub static LIFETIME_EVENTS_STORAGE_SIZE: Lazy = Lazy::new(|| { .expect("metric can be created") }); -pub static EVENTS_INGESTED_DATE: Lazy = Lazy::new(|| { - IntGaugeVec::new( +pub static EVENTS_INGESTED_DATE: Lazy = Lazy::new(|| { + IntCounterVec::new( Opts::new( "events_ingested_date", - "Events ingested on a particular date", + "Events ingested for a stream on a particular date", ) .namespace(METRICS_NAMESPACE), &["stream", "format", "date"], @@ -126,11 +136,11 @@ pub static EVENTS_INGESTED_DATE: Lazy = Lazy::new(|| { .expect("metric can be created") }); -pub static EVENTS_INGESTED_SIZE_DATE: Lazy = Lazy::new(|| { - IntGaugeVec::new( +pub static EVENTS_INGESTED_SIZE_DATE: Lazy = Lazy::new(|| { + IntCounterVec::new( Opts::new( "events_ingested_size_date", - "Events ingested size in bytes on a particular date", + "Events ingested size in bytes for a stream on a particular date", ) .namespace(METRICS_NAMESPACE), &["stream", "format", "date"], @@ -138,11 +148,11 @@ pub static EVENTS_INGESTED_SIZE_DATE: Lazy = Lazy::new(|| { .expect("metric can be created") }); -pub static EVENTS_STORAGE_SIZE_DATE: Lazy = Lazy::new(|| { - IntGaugeVec::new( +pub static EVENTS_STORAGE_SIZE_DATE: Lazy = Lazy::new(|| { + IntCounterVec::new( Opts::new( "events_storage_size_date", - "Events storage size in bytes on a particular date", + "Events storage size in bytes for a stream on a particular date", ) .namespace(METRICS_NAMESPACE), &["type", "stream", "format", "date"], @@ -150,6 +160,42 @@ pub static EVENTS_STORAGE_SIZE_DATE: Lazy = Lazy::new(|| { .expect("metric can be created") }); +pub static TOTAL_EVENTS_INGESTED_DATE: Lazy = Lazy::new(|| { + IntCounterVec::new( + Opts::new( + "total_events_ingested_date", + "total events ingested on a particular date", + ) + .namespace(METRICS_NAMESPACE), + &["format", "date"], + ) + .expect("metric can be created") +}); + +pub static TOTAL_EVENTS_INGESTED_SIZE_DATE: Lazy = Lazy::new(|| { + IntCounterVec::new( + Opts::new( + "total_events_ingested_size_date", + "Total events ingested size in bytes on a particular date", + ) + .namespace(METRICS_NAMESPACE), + &["format", "date"], + ) + .expect("metric can be created") +}); + +pub static TOTAL_EVENTS_STORAGE_SIZE_DATE: Lazy = Lazy::new(|| { + IntCounterVec::new( + Opts::new( + "total_events_storage_size_date", + "Total events storage size in bytes on a particular date", + ) + .namespace(METRICS_NAMESPACE), + &["format", "date"], + ) + .expect("metric can be created") +}); + pub static STAGING_FILES: Lazy = Lazy::new(|| { IntGaugeVec::new( Opts::new("staging_files", "Active Staging files").namespace(METRICS_NAMESPACE), @@ -219,6 +265,15 @@ fn custom_metrics(registry: &Registry) { registry .register(Box::new(EVENTS_STORAGE_SIZE_DATE.clone())) .expect("metric can be registered"); + registry + .register(Box::new(TOTAL_EVENTS_INGESTED_DATE.clone())) + .expect("metric can be registered"); + registry + .register(Box::new(TOTAL_EVENTS_INGESTED_SIZE_DATE.clone())) + .expect("metric can be registered"); + registry + .register(Box::new(TOTAL_EVENTS_STORAGE_SIZE_DATE.clone())) + .expect("metric can be registered"); registry .register(Box::new(STAGING_FILES.clone())) .expect("metric can be registered"); diff --git a/src/query/listing_table_builder.rs b/src/query/listing_table_builder.rs index 9ca484ead..a2087d2cc 100644 --- a/src/query/listing_table_builder.rs +++ b/src/query/listing_table_builder.rs @@ -16,7 +16,7 @@ * */ -use std::{collections::HashMap, ops::Bound, pin::Pin, sync::Arc}; +use std::{ops::Bound, sync::Arc}; use 
arrow_schema::Schema; use datafusion::{ @@ -27,9 +27,7 @@ use datafusion::{ error::DataFusionError, logical_expr::col, }; -use futures_util::{Future, TryStreamExt, stream::FuturesUnordered}; use itertools::Itertools; -use object_store::{ObjectMeta, ObjectStore, path::Path}; use crate::{ OBJECT_STORE_DATA_GRANULARITY, event::DEFAULT_TIMESTAMP_KEY, storage::ObjectStorage, @@ -56,7 +54,6 @@ impl ListingTableBuilder { pub async fn populate_via_listing( self, storage: Arc, - client: Arc, time_filters: &[PartialTimeFilter], ) -> Result { // Extract the minimum start time from the time filters. @@ -90,67 +87,28 @@ impl ListingTableBuilder { let prefixes = TimeRange::new(start_time.and_utc(), end_time.and_utc()) .generate_prefixes(OBJECT_STORE_DATA_GRANULARITY); - // Categorizes prefixes into "minute" and general resolve lists. - let mut minute_resolve = HashMap::>::new(); - let mut all_resolve = Vec::new(); + // Build all prefixes as relative paths + let prefixes: Vec<_> = prefixes + .into_iter() + .map(|prefix| { + relative_path::RelativePathBuf::from(format!("{}/{}", &self.stream, prefix)) + }) + .collect(); + + // Use storage.list_dirs_relative for all prefixes and flatten results + let mut listing = Vec::new(); for prefix in prefixes { - let path = relative_path::RelativePathBuf::from(format!("{}/{}", &self.stream, prefix)); - let prefix = storage.absolute_url(path.as_relative_path()).to_string(); - if let Some(pos) = prefix.rfind("minute") { - let hour_prefix = &prefix[..pos]; - minute_resolve - .entry(hour_prefix.to_owned()) - .or_default() - .push(prefix); - } else { - all_resolve.push(prefix); + match storage.list_dirs_relative(&prefix).await { + Ok(paths) => { + listing.extend(paths.into_iter().map(|p| p.to_string())); + } + Err(e) => { + return Err(DataFusionError::External(Box::new(e))); + } } } - /// Resolve all prefixes asynchronously and collect the object metadata. - type ResolveFuture = - Pin, object_store::Error>> + Send>>; - let tasks: FuturesUnordered = FuturesUnordered::new(); - for (listing_prefix, prefixes) in minute_resolve { - let client = Arc::clone(&client); - tasks.push(Box::pin(async move { - let path = Path::from(listing_prefix); - let mut objects = client.list(Some(&path)).try_collect::>().await?; - - objects.retain(|obj| { - prefixes.iter().any(|prefix| { - obj.location - .prefix_matches(&object_store::path::Path::from(prefix.as_ref())) - }) - }); - - Ok(objects) - })); - } - - for prefix in all_resolve { - let client = Arc::clone(&client); - tasks.push(Box::pin(async move { - client - .list(Some(&object_store::path::Path::from(prefix))) - .try_collect::>() - .await - })); - } - - let listing = tasks - .try_collect::>>() - .await - .map_err(|err| DataFusionError::External(Box::new(err)))? 
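In the listing_table_builder.rs hunk above, the builder now issues one list_dirs_relative call per generated time prefix and flattens the results, instead of splitting prefixes into minute and hour groups and listing them through the raw object_store client. A rough, synchronous sketch of that control flow, assuming a list_dirs closure as a stand-in for the real ObjectStorage::list_dirs_relative:

```rust
use itertools::Itertools;

// Hypothetical stand-in for ObjectStorage::list_dirs_relative: prefix -> entries.
fn list_dirs(prefix: &str) -> Result<Vec<String>, String> {
    Ok(vec![format!("{prefix}/part-0000.parquet")])
}

fn populate(prefixes: Vec<String>) -> Result<Vec<String>, String> {
    let mut listing = Vec::new();
    for prefix in prefixes {
        // One listing per prefix; any error aborts the whole build, as in the patch.
        listing.extend(list_dirs(&prefix)?);
    }
    // Newest-first ordering, mirroring the patch's sorted().rev().collect_vec().
    Ok(listing.into_iter().sorted().rev().collect_vec())
}

fn main() {
    let listing = populate(vec![
        "my_stream/date=2025-08-24/hour=10/minute=05".to_string(),
        "my_stream/date=2025-08-24/hour=10/minute=06".to_string(),
    ])
    .expect("listing succeeds");
    println!("{listing:?}");
}
```

The trade-off is simplicity over parallelism: the old code resolved prefixes concurrently through FuturesUnordered, while the new loop lists prefixes one at a time.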
- .into_iter() - .flat_map(|res| { - res.into_iter() - .map(|obj| obj.location.to_string()) - .collect::>() - }) - .sorted() - .rev() - .collect_vec(); + let listing = listing.into_iter().sorted().rev().collect_vec(); Ok(Self { stream: self.stream, diff --git a/src/query/mod.rs b/src/query/mod.rs index 6c142c6a6..07b3547aa 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -514,13 +514,6 @@ pub async fn get_manifest_list( ) -> Result, QueryError> { let glob_storage = PARSEABLE.storage.get_object_store(); - let object_store = QUERY_SESSION - .state() - .runtime_env() - .object_store_registry - .get_store(&glob_storage.store_url()) - .unwrap(); - // get object store let object_store_format = glob_storage .get_object_store_format(stream_name) @@ -560,7 +553,7 @@ pub async fn get_manifest_list( ]; let all_manifest_files = collect_manifest_files( - object_store, + glob_storage, merged_snapshot .manifests(&time_filter) .into_iter() diff --git a/src/query/stream_schema_provider.rs b/src/query/stream_schema_provider.rs index 8765650e6..efe0fad7f 100644 --- a/src/query/stream_schema_provider.rs +++ b/src/query/stream_schema_provider.rs @@ -16,11 +16,10 @@ * */ -use std::{any::Any, collections::HashMap, ops::Bound, sync::Arc}; +use std::{any::Any, collections::HashMap, ops::Bound, path::PathBuf, sync::Arc}; use arrow_array::RecordBatch; use arrow_schema::{Schema, SchemaRef, SortOptions}; -use bytes::Bytes; use chrono::{DateTime, NaiveDateTime, TimeDelta, Timelike, Utc}; use datafusion::{ catalog::{SchemaProvider, Session}, @@ -45,11 +44,9 @@ use datafusion::{ prelude::Expr, scalar::ScalarValue, }; -use futures_util::{StreamExt, TryFutureExt, TryStreamExt, stream::FuturesOrdered}; +use futures_util::TryFutureExt; use itertools::Itertools; -use object_store::{ObjectStore, path::Path}; use relative_path::RelativePathBuf; -use url::Url; use crate::{ catalog::{ @@ -60,10 +57,10 @@ use crate::{ }, event::DEFAULT_TIMESTAMP_KEY, hottier::HotTierManager, - metrics::QUERY_CACHE_HIT, + metrics::{QUERY_CACHE_HIT, storage::STORAGE_FILES_SCANNED}, option::Mode, parseable::{PARSEABLE, STREAM_EXISTS}, - storage::{ObjectStorage, ObjectStoreFormat, STREAM_ROOT_DIRECTORY}, + storage::{ObjectStorage, ObjectStorageError, ObjectStoreFormat, STREAM_ROOT_DIRECTORY}, }; use super::listing_table_builder::ListingTableBuilder; @@ -92,7 +89,6 @@ impl SchemaProvider for GlobalSchemaProvider { .expect(STREAM_EXISTS) .get_schema(), stream: name.to_owned(), - url: self.storage.store_url(), }))) } else { Ok(None) @@ -109,8 +105,6 @@ struct StandardTableProvider { schema: SchemaRef, // prefix under which to find snapshot stream: String, - // url to find right instance of object store - url: Url, } impl StandardTableProvider { @@ -277,7 +271,6 @@ impl StandardTableProvider { &self, execution_plans: &mut Vec>, glob_storage: Arc, - object_store: Arc, time_filters: &[PartialTimeFilter], state: &dyn Session, projection: Option<&Vec>, @@ -286,7 +279,7 @@ impl StandardTableProvider { time_partition: Option, ) -> Result<(), DataFusionError> { ListingTableBuilder::new(self.stream.to_owned()) - .populate_via_listing(glob_storage.clone(), object_store, time_filters) + .populate_via_listing(glob_storage.clone(), time_filters) .and_then(|builder| async { let table = builder.build( self.schema.clone(), @@ -328,7 +321,7 @@ impl StandardTableProvider { &self, manifest_files: Vec, ) -> (Vec>, datafusion::common::Statistics) { - let target_partition = num_cpus::get(); + let target_partition: usize = num_cpus::get(); let mut partitioned_files = 
Vec::from_iter((0..target_partition).map(|_| Vec::new())); let mut column_statistics = HashMap::>::new(); let mut count = 0; @@ -408,20 +401,21 @@ impl StandardTableProvider { async fn collect_from_snapshot( snapshot: &Snapshot, time_filters: &[PartialTimeFilter], - object_store: Arc, + storage: Arc, filters: &[Expr], limit: Option, ) -> Result, DataFusionError> { let items = snapshot.manifests(time_filters); let manifest_files = collect_manifest_files( - object_store, + storage, items .into_iter() .sorted_by_key(|file| file.time_lower_bound) .map(|item| item.manifest_path) .collect(), ) - .await?; + .await + .map_err(|err| DataFusionError::External(Box::new(err)))?; let mut manifest_files: Vec<_> = manifest_files .into_iter() @@ -474,14 +468,8 @@ impl TableProvider for StandardTableProvider { limit: Option, ) -> Result, DataFusionError> { let mut execution_plans = vec![]; - let object_store = state - .runtime_env() - .object_store_registry - .get_store(&self.url) - .unwrap(); - let glob_storage = PARSEABLE.storage.get_object_store(); - - let object_store_format = glob_storage + let storage = PARSEABLE.storage().get_object_store(); + let object_store_format = storage .get_object_store_format(&self.stream) .await .map_err(|err| DataFusionError::Plan(err.to_string()))?; @@ -501,7 +489,7 @@ impl TableProvider for StandardTableProvider { let mut merged_snapshot = Snapshot::default(); if PARSEABLE.options.mode == Mode::Query || PARSEABLE.options.mode == Mode::Prism { let path = RelativePathBuf::from_iter([&self.stream, STREAM_ROOT_DIRECTORY]); - let obs = glob_storage + let obs = storage .get_objects( Some(&path), Box::new(|file_name| file_name.ends_with("stream.json")), @@ -532,8 +520,7 @@ impl TableProvider for StandardTableProvider { if let Some(listing_time_filter) = listing_time_fiters { self.legacy_listing_table( &mut execution_plans, - glob_storage.clone(), - object_store.clone(), + storage.clone(), &listing_time_filter, state, projection, @@ -548,7 +535,7 @@ impl TableProvider for StandardTableProvider { let mut manifest_files = collect_from_snapshot( &merged_snapshot, &time_filters, - object_store, + storage.clone(), filters, limit, ) @@ -579,10 +566,15 @@ impl TableProvider for StandardTableProvider { return self.final_plan(execution_plans, projection); } + let parquet_files_to_scan = manifest_files.len(); + STORAGE_FILES_SCANNED + .with_label_values(&[PARSEABLE.storage().name(), "GET"]) + .inc_by(parquet_files_to_scan as f64); + let (partitioned_files, statistics) = self.partitioned_files(manifest_files); self.create_parquet_physical_plan( &mut execution_plans, - ObjectStoreUrl::parse(glob_storage.store_url()).unwrap(), + ObjectStoreUrl::parse(storage.store_url()).unwrap(), partitioned_files, statistics, projection, @@ -849,24 +841,27 @@ fn extract_timestamp_bound( } pub async fn collect_manifest_files( - storage: Arc, + storage: Arc, manifest_urls: Vec, -) -> Result, object_store::Error> { - let tasks = manifest_urls.into_iter().map(|path| { - let path = Path::parse(path).unwrap(); +) -> Result, ObjectStorageError> { + let mut tasks = Vec::new(); + manifest_urls.into_iter().for_each(|path| { + let path = RelativePathBuf::from_path(PathBuf::from(path)).expect("Invalid path"); let storage = Arc::clone(&storage); - async move { storage.get(&path).await } + tasks.push(tokio::task::spawn(async move { + storage.get_object(&path).await + })); }); - let resp = FuturesOrdered::from_iter(tasks) - .and_then(|res| res.bytes()) - .collect::>>() - .await; + let mut op = Vec::new(); + for task in 
tasks { + let file = task.await??; + op.push(file); + } - Ok(resp + Ok(op .into_iter() - .flat_map(|res| res.ok()) - .map(|bytes| serde_json::from_slice(&bytes).unwrap()) + .map(|res| serde_json::from_slice(&res).expect("Data is invalid for Manifest")) .collect()) } diff --git a/src/stats.rs b/src/stats.rs index 5a167cc39..0c2214043 100644 --- a/src/stats.rs +++ b/src/stats.rs @@ -20,16 +20,17 @@ use std::collections::HashMap; use std::sync::Arc; use once_cell::sync::Lazy; -use prometheus::IntGaugeVec; use prometheus::core::Collector; use prometheus::proto::MetricFamily; +use prometheus::{IntCounterVec, IntGaugeVec}; use tracing::warn; use crate::metrics::{ DELETED_EVENTS_STORAGE_SIZE, EVENTS_DELETED, EVENTS_DELETED_SIZE, EVENTS_INGESTED, EVENTS_INGESTED_DATE, EVENTS_INGESTED_SIZE, EVENTS_INGESTED_SIZE_DATE, EVENTS_STORAGE_SIZE_DATE, LIFETIME_EVENTS_INGESTED, LIFETIME_EVENTS_INGESTED_SIZE, - LIFETIME_EVENTS_STORAGE_SIZE, STORAGE_SIZE, + LIFETIME_EVENTS_STORAGE_SIZE, STORAGE_SIZE, TOTAL_EVENTS_INGESTED_DATE, + TOTAL_EVENTS_INGESTED_SIZE_DATE, TOTAL_EVENTS_STORAGE_SIZE_DATE, }; use crate::storage::{ObjectStorage, ObjectStorageError, ObjectStoreFormat}; @@ -136,6 +137,10 @@ pub async fn update_deleted_stats( "parquet", &manifest_date, ]); + let _ = TOTAL_EVENTS_INGESTED_DATE.remove_label_values(&["json", &manifest_date]); + let _ = TOTAL_EVENTS_INGESTED_SIZE_DATE.remove_label_values(&["json", &manifest_date]); + let _ = + TOTAL_EVENTS_STORAGE_SIZE_DATE.remove_label_values(&["parquet", &manifest_date]); num_row += manifest.events_ingested as i64; ingestion_size += manifest.ingestion_size as i64; storage_size += manifest.storage_size as i64; @@ -197,7 +202,7 @@ fn remove_label_values(lazy_static: &Lazy, event_labels: &[&str]) { } } -fn delete_with_label_prefix(metrics: &IntGaugeVec, prefix: &[&str]) { +fn delete_with_label_prefix(metrics: &IntCounterVec, prefix: &[&str]) { let families: Vec = metrics.collect().into_iter().collect(); for metric in families.iter().flat_map(|m| m.get_metric()) { let label_map: HashMap<&str, &str> = metric diff --git a/src/storage/azure_blob.rs b/src/storage/azure_blob.rs index fcf090126..2956dfe91 100644 --- a/src/storage/azure_blob.rs +++ b/src/storage/azure_blob.rs @@ -19,7 +19,10 @@ use std::{ collections::{BTreeMap, HashSet}, path::Path, - sync::Arc, + sync::{ + Arc, + atomic::{AtomicU64, Ordering}, + }, time::{Duration, Instant}, }; @@ -210,13 +213,15 @@ pub struct BlobStore { impl BlobStore { async fn _get_object(&self, path: &RelativePath) -> Result { - let instant = Instant::now(); + let time = std::time::Instant::now(); let resp = self.client.get(&to_object_store_path(path)).await; - let elapsed = instant.elapsed().as_secs_f64(); - + let elapsed = time.elapsed().as_secs_f64(); + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "GET"]) + .inc(); match resp { Ok(resp) => { - let body = resp.bytes().await.unwrap(); + let body: Bytes = resp.bytes().await.unwrap(); STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "GET", "200"]) .observe(elapsed); @@ -237,9 +242,12 @@ impl BlobStore { path: &RelativePath, resource: PutPayload, ) -> Result<(), ObjectStorageError> { - let instant = Instant::now(); + let time = std::time::Instant::now(); let resp = self.client.put(&to_object_store_path(path), resource).await; - let elapsed = instant.elapsed().as_secs_f64(); + let elapsed = time.elapsed().as_secs_f64(); + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "PUT"]) + .inc(); match resp { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ 
-252,18 +260,14 @@ impl BlobStore { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "PUT", status_code]) .observe(elapsed); - - if let object_store::Error::NotFound { source, .. } = &err { - return Err(ObjectStorageError::Custom( - format!("Failed to upload, error: {source:?}").to_string(), - )); - } Err(err.into()) } } } async fn _delete_prefix(&self, key: &str) -> Result<(), ObjectStorageError> { + let files_scanned = Arc::new(AtomicU64::new(0)); + let files_deleted = Arc::new(AtomicU64::new(0)); // Track LIST operation let list_start = Instant::now(); let object_stream = self.client.list(Some(&(key.into()))); @@ -274,19 +278,20 @@ impl BlobStore { object_stream .for_each_concurrent(None, |x| async { + files_scanned.fetch_add(1, Ordering::Relaxed); match x { Ok(obj) => { - // Track individual DELETE operation + files_deleted.fetch_add(1, Ordering::Relaxed); let delete_start = Instant::now(); - match self.client.delete(&obj.location).await { + let delete_resp = self.client.delete(&obj.location).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + match delete_resp { Ok(_) => { - let delete_elapsed = delete_start.elapsed().as_secs_f64(); STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "DELETE", "200"]) .observe(delete_elapsed); } Err(err) => { - let delete_elapsed = delete_start.elapsed().as_secs_f64(); let status_code = error_to_status_code(&err); STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "DELETE", status_code]) @@ -302,32 +307,26 @@ impl BlobStore { }) .await; + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "LIST"]) + .inc_by(files_scanned.load(Ordering::Relaxed) as f64); + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "DELETE"]) + .inc_by(files_deleted.load(Ordering::Relaxed) as f64); Ok(()) } async fn _list_streams(&self) -> Result, ObjectStorageError> { let mut result_file_list = HashSet::new(); + let mut total_files_scanned = 0u64; - // Track initial LIST operation let list_start = Instant::now(); - let resp = self.client.list_with_delimiter(None).await; + let resp = self.client.list_with_delimiter(None).await?; let list_elapsed = list_start.elapsed().as_secs_f64(); - - let resp = match resp { - Ok(resp) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", "200"]) - .observe(list_elapsed); - resp - } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", status_code]) - .observe(list_elapsed); - return Err(err.into()); - } - }; + total_files_scanned += resp.objects.len() as u64; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); let streams = resp .common_prefixes @@ -346,12 +345,13 @@ impl BlobStore { let resp = self.client.list_with_delimiter(Some(&stream_path)).await; let stream_list_elapsed = stream_list_start.elapsed().as_secs_f64(); - match resp { + match &resp { Ok(resp) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "LIST", "200"]) .observe(stream_list_elapsed); + total_files_scanned += resp.objects.len() as u64; if resp .objects .iter() @@ -361,22 +361,28 @@ impl BlobStore { } } Err(err) => { - let status_code = error_to_status_code(&err); + let status_code = error_to_status_code(err); STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "LIST", status_code]) .observe(stream_list_elapsed); - return Err(err.into()); + return 
Err(ObjectStorageError::UnhandledError(Box::new( + std::io::Error::other(format!("List operation failed: {}", err)), + ))); } } } + // Record total files scanned across all operations + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "LIST"]) + .inc_by(total_files_scanned as f64); + Ok(result_file_list) } async fn _list_dates(&self, stream: &str) -> Result, ObjectStorageError> { - // Track LIST operation let list_start = Instant::now(); - let resp = self + let resp: Result = self .client .list_with_delimiter(Some(&(stream.into()))) .await; @@ -400,6 +406,10 @@ impl BlobStore { let common_prefixes = resp.common_prefixes; + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "LIST"]) + .inc_by(common_prefixes.len() as f64); + // return prefixes at the root level let dates: Vec<_> = common_prefixes .iter() @@ -415,6 +425,7 @@ impl BlobStore { stream: &str, ) -> Result>, ObjectStorageError> { let mut result_file_list: BTreeMap> = BTreeMap::new(); + let mut total_files_scanned = 0u64; // Track initial LIST operation let list_start = Instant::now(); @@ -426,6 +437,7 @@ impl BlobStore { let resp = match resp { Ok(resp) => { + total_files_scanned += resp.objects.len() as u64; STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "LIST", "200"]) .observe(list_elapsed); @@ -462,6 +474,7 @@ impl BlobStore { .with_label_values(&["azure_blob", "LIST", "200"]) .observe(date_list_elapsed); + total_files_scanned += resp.objects.len() as u64; let manifests: Vec = resp .objects .iter() @@ -479,40 +492,38 @@ impl BlobStore { } } } - Ok(result_file_list) - } - async fn _upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { - // // TODO: Uncomment this when multipart is fixed - // let should_multipart = std::fs::metadata(path)?.len() > MULTIPART_UPLOAD_SIZE as u64; - let should_multipart = false; + // Record total files scanned across all date operations + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "LIST"]) + .inc_by(total_files_scanned as f64); - if should_multipart { - // self._upload_multipart(key, path).await - // this branch will never get executed - Ok(()) - } else { - let bytes = tokio::fs::read(path).await?; + Ok(result_file_list) + } - let put_start = Instant::now(); - let result = self.client.put(&key.into(), bytes.into()).await; - let put_elapsed = put_start.elapsed().as_secs_f64(); + async fn _upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { + let bytes = tokio::fs::read(path).await?; - match result { - Ok(result) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "PUT", "200"]) - .observe(put_elapsed); - info!("Uploaded file to Azure Blob Storage: {:?}", result); - Ok(()) - } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "PUT", status_code]) - .observe(put_elapsed); - Err(err.into()) - } + let put_start = Instant::now(); + let result = self.client.put(&key.into(), bytes.into()).await; + let put_elapsed = put_start.elapsed().as_secs_f64(); + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "PUT"]) + .inc(); + match result { + Ok(result) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT", "200"]) + .observe(put_elapsed); + info!("Uploaded file to Azure Blob Storage: {:?}", result); + Ok(()) + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "PUT", status_code]) + 
.observe(put_elapsed); + Err(err.into()) } } } @@ -689,13 +700,6 @@ impl BlobStore { #[async_trait] impl ObjectStorage for BlobStore { - async fn upload_multipart( - &self, - key: &RelativePath, - path: &Path, - ) -> Result<(), ObjectStorageError> { - self._upload_multipart(key, path).await - } async fn get_buffered_reader( &self, _path: &RelativePath, @@ -707,29 +711,42 @@ impl ObjectStorage for BlobStore { ), ))) } - async fn head(&self, _path: &RelativePath) -> Result { - // Record attempt to access file (even though operation not implemented) - STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "HEAD"]) - .inc(); - Err(ObjectStorageError::UnhandledError(Box::new( - std::io::Error::new( - std::io::ErrorKind::Unsupported, - "Head operation not implemented for Blob Storage yet", - ), - ))) + async fn upload_multipart( + &self, + key: &RelativePath, + path: &Path, + ) -> Result<(), ObjectStorageError> { + self._upload_multipart(key, path).await } - async fn get_object(&self, path: &RelativePath) -> Result { - let result = self._get_object(path).await?; + async fn head(&self, path: &RelativePath) -> Result { + let head_start = Instant::now(); + let result = self.client.head(&to_object_store_path(path)).await; + let head_elapsed = head_start.elapsed().as_secs_f64(); + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "HEAD", "200"]) + .observe(head_elapsed); + // Record single file accessed + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "HEAD"]) + .inc(); + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "HEAD", status_code]) + .observe(head_elapsed); + } + } - // Record single file accessed - STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "GET"]) - .inc(); + Ok(result?) + } - Ok(result) + async fn get_object(&self, path: &RelativePath) -> Result { + Ok(self._get_object(path).await?) 
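The Azure head, get_object, and related methods above all share one shape: time the client call, observe the elapsed seconds on STORAGE_REQUEST_RESPONSE_TIME under provider, method, and status labels, and bump STORAGE_FILES_SCANNED for single-object operations. A condensed sketch of that shape as a generic helper; the histogram here is a local stand-in and the status mapping is simplified compared to the crate's error_to_status_code:

```rust
use once_cell::sync::Lazy;
use prometheus::{HistogramOpts, HistogramVec};
use std::{future::Future, time::Instant};

// Local stand-in for the shared STORAGE_REQUEST_RESPONSE_TIME histogram.
static REQUEST_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    HistogramVec::new(
        HistogramOpts::new("storage_request_response_time", "Storage Request Latency"),
        &["provider", "method", "status"],
    )
    .expect("metric can be created")
});

// Run a fallible storage call, then record provider/method/status exactly once.
async fn timed<T, E, F>(provider: &str, method: &str, call: F) -> Result<T, E>
where
    F: Future<Output = Result<T, E>>,
{
    let start = Instant::now();
    let result = call.await;
    let elapsed = start.elapsed().as_secs_f64();
    // Simplified mapping; the real code derives the label from the error kind.
    let status = if result.is_ok() { "200" } else { "500" };
    REQUEST_TIME
        .with_label_values(&[provider, method, status])
        .observe(elapsed);
    result
}

#[tokio::main]
async fn main() {
    // Usage sketch with a trivially successful "storage call".
    let _ = timed("azure_blob", "HEAD", async { Ok::<_, ()>(()) }).await;
}
```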
} async fn get_objects( @@ -750,21 +767,11 @@ impl ObjectStorage for BlobStore { let mut res = vec![]; let mut files_scanned = 0; + // Note: We track each streaming list item retrieval while let Some(meta_result) = list_stream.next().await { - let list_elapsed = list_start.elapsed().as_secs_f64(); - let meta = match meta_result { - Ok(meta) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", "200"]) - .observe(list_elapsed); - meta - } + Ok(meta) => meta, Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -782,13 +789,23 @@ impl ObjectStorage for BlobStore { .map_err(ObjectStorageError::PathError)?, ) .await?; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "GET", "200"]) + .observe(list_start.elapsed().as_secs_f64()); + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "GET"]) + .inc(); res.push(byts); } + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); // Record total files scanned STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "GET"]) + .with_label_values(&["azure_blob", "LIST"]) .inc_by(files_scanned as f64); Ok(res) @@ -805,20 +822,9 @@ impl ObjectStorage for BlobStore { let mut object_stream = self.client.list(Some(&self.root)); while let Some(meta_result) = object_stream.next().await { - let list_elapsed = list_start.elapsed().as_secs_f64(); - let meta = match meta_result { - Ok(meta) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", "200"]) - .observe(list_elapsed); - meta - } + Ok(meta) => meta, Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -830,7 +836,10 @@ impl ObjectStorage for BlobStore { path_arr.push(RelativePathBuf::from(meta.location.as_ref())); } } - + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); // Record total files scanned STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "LIST"]) @@ -852,20 +861,9 @@ impl ObjectStorage for BlobStore { let mut object_stream = self.client.list(Some(&path)); while let Some(meta_result) = object_stream.next().await { - let list_elapsed = list_start.elapsed().as_secs_f64(); - let meta = match meta_result { - Ok(meta) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", "200"]) - .observe(list_elapsed); - meta - } + Ok(meta) => meta, Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["azure_blob", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -877,6 +875,10 @@ impl ObjectStorage for BlobStore { path_arr.push(RelativePathBuf::from(meta.location.as_ref())); } } + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); path_arr.push(RelativePathBuf::from_iter([ stream_name, @@ -901,11 +903,6 @@ impl ObjectStorage for BlobStore { .await .map_err(|err| ObjectStorageError::ConnectionError(Box::new(err)))?; - // Record single file written - 
STORAGE_FILES_SCANNED - .with_label_values(&["azure_blob", "PUT"]) - .inc(); - Ok(()) } @@ -962,6 +959,9 @@ impl ObjectStorage for BlobStore { .observe(head_elapsed); } } + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "HEAD"]) + .inc(); Ok(result.map(|_| ())?) } @@ -978,7 +978,9 @@ impl ObjectStorage for BlobStore { let delete_start = Instant::now(); let result = self.client.delete(&to_object_store_path(&file)).await; let delete_elapsed = delete_start.elapsed().as_secs_f64(); - + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "DELETE"]) + .inc(); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -992,15 +994,7 @@ impl ObjectStorage for BlobStore { .with_label_values(&["azure_blob", "DELETE", status_code]) .observe(delete_elapsed); - // if the object is not found, it is not an error - // the given url path was incorrect - if matches!(err, object_store::Error::NotFound { .. }) { - error!("Node does not exist"); - Err(err.into()) - } else { - error!("Error deleting node meta file: {:?}", err); - Err(err.into()) - } + Err(err.into()) } } } @@ -1058,7 +1052,7 @@ impl ObjectStorage for BlobStore { stream_json_check.try_collect::<()>().await?; - Ok(dirs.into_iter().collect()) + Ok(dirs) } async fn list_dates(&self, stream_name: &str) -> Result, ObjectStorageError> { @@ -1073,9 +1067,14 @@ impl ObjectStorage for BlobStore { date: &str, ) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from(format!("{}/{}/", stream_name, date)); + let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&pre)).await?; + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); - let hours = resp + let hours: Vec = resp .common_prefixes .iter() .filter_map(|path| { @@ -1102,9 +1101,14 @@ impl ObjectStorage for BlobStore { hour: &str, ) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from(format!("{}/{}/{}/", stream_name, date, hour)); + let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&pre)).await?; + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["azure_blob", "LIST", "200"]) + .observe(list_elapsed); - let minutes = resp + let minutes: Vec = resp .common_prefixes .iter() .filter_map(|path| { @@ -1135,9 +1139,7 @@ impl ObjectStorage for BlobStore { } async fn upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { - self._upload_file(key, path).await?; - - Ok(()) + Ok(self._upload_file(key, path).await?) 
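Every error branch in these hunks labels the histogram with the result of error_to_status_code, a helper defined elsewhere in the crate and not shown in this patch. Purely as an illustration of the idea, and assuming variants and codes that may not match the real mapping, a reconstruction could look like this:

```rust
use object_store::{ObjectStore, memory::InMemory, path::Path};

// Hypothetical reconstruction; the crate's actual error_to_status_code may differ.
fn error_to_status_code(err: &object_store::Error) -> &'static str {
    match err {
        object_store::Error::NotFound { .. } => "404",
        object_store::Error::AlreadyExists { .. } => "409",
        // Everything else is lumped into a generic server-side failure label.
        _ => "500",
    }
}

#[tokio::main]
async fn main() {
    // Produce a genuine NotFound by reading a key that was never written.
    let store = InMemory::new();
    match store.get(&Path::from("does/not/exist")).await {
        Err(err) => println!("status label: {}", error_to_status_code(&err)),
        Ok(_) => println!("unexpectedly found the object"),
    }
}
```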
} fn absolute_url(&self, prefix: &RelativePath) -> object_store::path::Path { diff --git a/src/storage/gcs.rs b/src/storage/gcs.rs index 177f8f57d..9af321533 100644 --- a/src/storage/gcs.rs +++ b/src/storage/gcs.rs @@ -19,7 +19,10 @@ use std::{ collections::{BTreeMap, HashSet}, path::Path, - sync::Arc, + sync::{ + Arc, + atomic::{AtomicU64, Ordering}, + }, time::{Duration, Instant}, }; @@ -175,23 +178,25 @@ pub struct Gcs { impl Gcs { async fn _get_object(&self, path: &RelativePath) -> Result { - let get_start = Instant::now(); + let time = std::time::Instant::now(); let resp = self.client.get(&to_object_store_path(path)).await; - let get_elapsed = get_start.elapsed().as_secs_f64(); - + let elapsed = time.elapsed().as_secs_f64(); + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "GET"]) + .inc(); match resp { Ok(resp) => { - let body = resp.bytes().await.unwrap(); + let body: Bytes = resp.bytes().await.unwrap(); STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "GET", "200"]) - .observe(get_elapsed); + .observe(elapsed); Ok(body) } Err(err) => { let status_code = error_to_status_code(&err); STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "GET", status_code]) - .observe(get_elapsed); + .observe(elapsed); Err(err.into()) } } @@ -202,40 +207,32 @@ impl Gcs { path: &RelativePath, resource: PutPayload, ) -> Result<(), ObjectStorageError> { - let put_start = Instant::now(); + let time = std::time::Instant::now(); let resp = self.client.put(&to_object_store_path(path), resource).await; - let put_elapsed = put_start.elapsed().as_secs_f64(); - - if let Err(object_store::Error::NotFound { source, .. }) = &resp { - let source_str = source.to_string(); - if source_str.contains("NoSuchBucket") { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "PUT", "404"]) - .observe(put_elapsed); - return Err(ObjectStorageError::Custom( - format!("Bucket '{}' does not exist in GCS.", self.bucket).to_string(), - )); - } - } - + let elapsed = time.elapsed().as_secs_f64(); + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "PUT"]) + .inc(); match resp { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "PUT", "200"]) - .observe(put_elapsed); + .observe(elapsed); Ok(()) } Err(err) => { let status_code = error_to_status_code(&err); STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "PUT", status_code]) - .observe(put_elapsed); + .observe(elapsed); Err(err.into()) } } } async fn _delete_prefix(&self, key: &str) -> Result<(), ObjectStorageError> { + let files_scanned = Arc::new(AtomicU64::new(0)); + let files_deleted = Arc::new(AtomicU64::new(0)); // Track LIST operation let list_start = Instant::now(); let object_stream = self.client.list(Some(&(key.into()))); @@ -246,19 +243,20 @@ impl Gcs { object_stream .for_each_concurrent(None, |x| async { + files_scanned.fetch_add(1, Ordering::Relaxed); match x { Ok(obj) => { - // Track individual DELETE operation + files_deleted.fetch_add(1, Ordering::Relaxed); let delete_start = Instant::now(); - match self.client.delete(&obj.location).await { + let delete_resp = self.client.delete(&obj.location).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + match delete_resp { Ok(_) => { - let delete_elapsed = delete_start.elapsed().as_secs_f64(); STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "DELETE", "200"]) .observe(delete_elapsed); } Err(err) => { - let delete_elapsed = delete_start.elapsed().as_secs_f64(); let status_code = error_to_status_code(&err); STORAGE_REQUEST_RESPONSE_TIME 
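The _delete_prefix hunks in this patch (Azure earlier, GCS around this point, S3 later) share the same accounting approach: two shared AtomicU64s count objects scanned and deletes attempted while the listing stream is drained concurrently, and both totals are flushed into STORAGE_FILES_SCANNED once the stream ends. A small self-contained sketch of that counting shape over a fake listing stream:

```rust
use std::sync::{
    Arc,
    atomic::{AtomicU64, Ordering},
};

use futures_util::{StreamExt, stream};

#[tokio::main]
async fn main() {
    let files_scanned = Arc::new(AtomicU64::new(0));
    let files_deleted = Arc::new(AtomicU64::new(0));

    // Stand-in for client.list(Some(&prefix)): Ok items are listed objects.
    let object_stream = stream::iter(vec![Ok::<&str, ()>("obj-a"), Ok("obj-b"), Err(())]);

    object_stream
        .for_each_concurrent(None, |entry| {
            let files_scanned = Arc::clone(&files_scanned);
            let files_deleted = Arc::clone(&files_deleted);
            async move {
                files_scanned.fetch_add(1, Ordering::Relaxed);
                if entry.is_ok() {
                    // The real code issues and times the DELETE here.
                    files_deleted.fetch_add(1, Ordering::Relaxed);
                }
            }
        })
        .await;

    // One flush per operation after the stream drains, as in the patch.
    println!(
        "scanned={} deleted={}",
        files_scanned.load(Ordering::Relaxed),
        files_deleted.load(Ordering::Relaxed)
    );
}
```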
.with_label_values(&["gcs", "DELETE", status_code]) @@ -274,32 +272,26 @@ impl Gcs { }) .await; + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "LIST"]) + .inc_by(files_scanned.load(Ordering::Relaxed) as f64); + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "DELETE"]) + .inc_by(files_deleted.load(Ordering::Relaxed) as f64); Ok(()) } async fn _list_streams(&self) -> Result, ObjectStorageError> { let mut result_file_list = HashSet::new(); + let mut total_files_scanned = 0u64; - // Track initial LIST operation let list_start = Instant::now(); - let resp = self.client.list_with_delimiter(None).await; + let resp = self.client.list_with_delimiter(None).await?; let list_elapsed = list_start.elapsed().as_secs_f64(); - - let resp = match resp { - Ok(resp) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "LIST", "200"]) - .observe(list_elapsed); - resp - } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "LIST", status_code]) - .observe(list_elapsed); - return Err(err.into()); - } - }; + total_files_scanned += resp.objects.len() as u64; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); let streams = resp .common_prefixes @@ -324,6 +316,7 @@ impl Gcs { .with_label_values(&["gcs", "LIST", "200"]) .observe(stream_list_elapsed); + total_files_scanned += resp.objects.len() as u64; if resp .objects .iter() @@ -344,13 +337,17 @@ impl Gcs { } } + // Record total files scanned across all operations + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "LIST"]) + .inc_by(total_files_scanned as f64); + Ok(result_file_list) } async fn _list_dates(&self, stream: &str) -> Result, ObjectStorageError> { - // Track LIST operation let list_start = Instant::now(); - let resp = self + let resp: Result = self .client .list_with_delimiter(Some(&(stream.into()))) .await; @@ -374,6 +371,10 @@ impl Gcs { let common_prefixes = resp.common_prefixes; + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "LIST"]) + .inc_by(common_prefixes.len() as f64); + // return prefixes at the root level let dates: Vec<_> = common_prefixes .iter() @@ -389,6 +390,7 @@ impl Gcs { stream: &str, ) -> Result>, ObjectStorageError> { let mut result_file_list: BTreeMap> = BTreeMap::new(); + let mut total_files_scanned = 0u64; // Track initial LIST operation let list_start = Instant::now(); @@ -400,6 +402,7 @@ impl Gcs { let resp = match resp { Ok(resp) => { + total_files_scanned += resp.objects.len() as u64; STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "LIST", "200"]) .observe(list_elapsed); @@ -436,6 +439,7 @@ impl Gcs { .with_label_values(&["gcs", "LIST", "200"]) .observe(date_list_elapsed); + total_files_scanned += resp.objects.len() as u64; let manifests: Vec = resp .objects .iter() @@ -453,15 +457,24 @@ impl Gcs { } } } + + // Record total files scanned across all date operations + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "LIST"]) + .inc_by(total_files_scanned as f64); + Ok(result_file_list) } + async fn _upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { let bytes = tokio::fs::read(path).await?; let put_start = Instant::now(); let result = self.client.put(&key.into(), bytes.into()).await; let put_elapsed = put_start.elapsed().as_secs_f64(); - + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "PUT"]) + .inc(); match result { Ok(result) => { STORAGE_REQUEST_RESPONSE_TIME @@ -616,7 +629,9 @@ impl ObjectStorage for Gcs { let 
head_start = Instant::now(); let meta = self.client.head(path).await; let head_elapsed = head_start.elapsed().as_secs_f64(); - + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "HEAD"]) + .inc(); let meta = match meta { Ok(meta) => { STORAGE_REQUEST_RESPONSE_TIME @@ -637,6 +652,7 @@ impl ObjectStorage for Gcs { let buf = object_store::buffered::BufReader::new(store, &meta); Ok(buf) } + async fn upload_multipart( &self, key: &RelativePath, @@ -644,11 +660,14 @@ impl ObjectStorage for Gcs { ) -> Result<(), ObjectStorageError> { self._upload_multipart(key, path).await } + async fn head(&self, path: &RelativePath) -> Result { let head_start = Instant::now(); let result = self.client.head(&to_object_store_path(path)).await; let head_elapsed = head_start.elapsed().as_secs_f64(); - + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "HEAD"]) + .inc(); match &result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -671,14 +690,7 @@ impl ObjectStorage for Gcs { } async fn get_object(&self, path: &RelativePath) -> Result { - let result = self._get_object(path).await?; - - // Record single file accessed - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "GET"]) - .inc(); - - Ok(result) + Ok(self._get_object(path).await?) } async fn get_objects( @@ -701,20 +713,9 @@ impl ObjectStorage for Gcs { // Note: We track each streaming list item retrieval while let Some(meta_result) = list_stream.next().await { - let list_elapsed = list_start.elapsed().as_secs_f64(); - let meta = match meta_result { - Ok(meta) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "LIST", "200"]) - .observe(list_elapsed); - meta - } + Ok(meta) => meta, Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -732,13 +733,23 @@ impl ObjectStorage for Gcs { .map_err(ObjectStorageError::PathError)?, ) .await?; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "GET", "200"]) + .observe(list_start.elapsed().as_secs_f64()); + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "GET"]) + .inc(); res.push(byts); } + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); // Record total files scanned STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "GET"]) + .with_label_values(&["gcs", "LIST"]) .inc_by(files_scanned as f64); Ok(res) @@ -755,20 +766,9 @@ impl ObjectStorage for Gcs { let mut object_stream = self.client.list(Some(&self.root)); while let Some(meta_result) = object_stream.next().await { - let list_elapsed = list_start.elapsed().as_secs_f64(); - let meta = match meta_result { - Ok(meta) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "LIST", "200"]) - .observe(list_elapsed); - meta - } + Ok(meta) => meta, Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -780,7 +780,10 @@ impl ObjectStorage for Gcs { path_arr.push(RelativePathBuf::from(meta.location.as_ref())); } } - + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); // Record total files scanned STORAGE_FILES_SCANNED .with_label_values(&["gcs", "LIST"]) @@ -802,20 +805,9 @@ impl ObjectStorage for Gcs { let mut 
object_stream = self.client.list(Some(&path)); while let Some(meta_result) = object_stream.next().await { - let list_elapsed = list_start.elapsed().as_secs_f64(); - let meta = match meta_result { - Ok(meta) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "LIST", "200"]) - .observe(list_elapsed); - meta - } + Ok(meta) => meta, Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["gcs", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -827,6 +819,10 @@ impl ObjectStorage for Gcs { path_arr.push(RelativePathBuf::from(meta.location.as_ref())); } } + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); path_arr.push(RelativePathBuf::from_iter([ stream_name, @@ -851,11 +847,6 @@ impl ObjectStorage for Gcs { .await .map_err(|err| ObjectStorageError::ConnectionError(Box::new(err)))?; - // Record single file written - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "PUT"]) - .inc(); - Ok(()) } @@ -866,22 +857,57 @@ impl ObjectStorage for Gcs { } async fn delete_object(&self, path: &RelativePath) -> Result<(), ObjectStorageError> { - let result = self.client.delete(&to_object_store_path(path)).await?; + let delete_start = Instant::now(); + let result = self.client.delete(&to_object_store_path(path)).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); - // Record single file deleted - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "DELETE"]) - .inc(); + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "DELETE", "200"]) + .observe(delete_elapsed); + // Record single file deleted + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "DELETE"]) + .inc(); + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "DELETE", status_code]) + .observe(delete_elapsed); + } + } - Ok(result) + Ok(result?) } async fn check(&self) -> Result<(), ObjectStorageError> { - Ok(self + let head_start = Instant::now(); + let result = self .client .head(&to_object_store_path(&parseable_json_path())) - .await - .map(|_| ())?) + .await; + let head_elapsed = head_start.elapsed().as_secs_f64(); + + match &result { + Ok(_) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "HEAD", "200"]) + .observe(head_elapsed); + } + Err(err) => { + let status_code = error_to_status_code(err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "HEAD", status_code]) + .observe(head_elapsed); + } + } + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "HEAD"]) + .inc(); + + Ok(result.map(|_| ())?) } async fn delete_stream(&self, stream_name: &str) -> Result<(), ObjectStorageError> { @@ -896,7 +922,9 @@ impl ObjectStorage for Gcs { let delete_start = Instant::now(); let result = self.client.delete(&to_object_store_path(&file)).await; let delete_elapsed = delete_start.elapsed().as_secs_f64(); - + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "DELETE"]) + .inc(); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -910,15 +938,7 @@ impl ObjectStorage for Gcs { .with_label_values(&["gcs", "DELETE", status_code]) .observe(delete_elapsed); - // if the object is not found, it is not an error - // the given url path was incorrect - if matches!(err, object_store::Error::NotFound { .. 
}) { - error!("Node does not exist"); - Err(err.into()) - } else { - error!("Error deleting node meta file: {:?}", err); - Err(err.into()) - } + Err(err.into()) } } } @@ -991,9 +1011,14 @@ impl ObjectStorage for Gcs { date: &str, ) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from(format!("{}/{}/", stream_name, date)); + let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&pre)).await?; + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); - let hours = resp + let hours: Vec = resp .common_prefixes .iter() .filter_map(|path| { @@ -1020,9 +1045,14 @@ impl ObjectStorage for Gcs { hour: &str, ) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from(format!("{}/{}/{}/", stream_name, date, hour)); + let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&pre)).await?; + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["gcs", "LIST", "200"]) + .observe(list_elapsed); - let minutes = resp + let minutes: Vec = resp .common_prefixes .iter() .filter_map(|path| { @@ -1053,9 +1083,7 @@ impl ObjectStorage for Gcs { } async fn upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { - self._upload_file(key, path).await?; - - Ok(()) + Ok(self._upload_file(key, path).await?) } fn absolute_url(&self, prefix: &RelativePath) -> object_store::path::Path { diff --git a/src/storage/object_storage.rs b/src/storage/object_storage.rs index a1e987068..ba4675b0e 100644 --- a/src/storage/object_storage.rs +++ b/src/storage/object_storage.rs @@ -54,6 +54,7 @@ use crate::handlers::http::modal::ingest_server::INGESTOR_EXPECT; use crate::handlers::http::modal::ingest_server::INGESTOR_META; use crate::handlers::http::users::CORRELATION_DIR; use crate::handlers::http::users::{DASHBOARDS_DIR, FILTER_DIR, USERS_ROOT_DIR}; +use crate::metrics::TOTAL_EVENTS_STORAGE_SIZE_DATE; use crate::metrics::storage::StorageMetrics; use crate::metrics::{EVENTS_STORAGE_SIZE_DATE, LIFETIME_EVENTS_STORAGE_SIZE, STORAGE_SIZE}; use crate::option::Mode; @@ -146,17 +147,22 @@ fn update_storage_metrics( ) -> Result<(), ObjectStorageError> { let mut file_date_part = filename.split('.').collect::>()[0]; file_date_part = file_date_part.split('=').collect::>()[1]; - let compressed_size = path.metadata().map_or(0, |meta| meta.len()); - + let compressed_size = path + .metadata() + .map(|m| m.len()) + .map_err(|e| ObjectStorageError::Custom(format!("metadata failed for {filename}: {e}")))?; STORAGE_SIZE .with_label_values(&["data", stream_name, "parquet"]) .add(compressed_size as i64); EVENTS_STORAGE_SIZE_DATE .with_label_values(&["data", stream_name, "parquet", file_date_part]) - .add(compressed_size as i64); + .inc_by(compressed_size); LIFETIME_EVENTS_STORAGE_SIZE .with_label_values(&["data", stream_name, "parquet"]) .add(compressed_size as i64); + TOTAL_EVENTS_STORAGE_SIZE_DATE + .with_label_values(&["parquet", file_date_part]) + .inc_by(compressed_size); Ok(()) } diff --git a/src/storage/s3.rs b/src/storage/s3.rs index 9e28c46b1..59a0efe0e 100644 --- a/src/storage/s3.rs +++ b/src/storage/s3.rs @@ -342,12 +342,13 @@ impl S3 { async fn _get_object(&self, path: &RelativePath) -> Result { let time = std::time::Instant::now(); let resp = self.client.get(&to_object_store_path(path)).await; - let elapsed = time.elapsed().as_secs_f64(); - + STORAGE_FILES_SCANNED + 
.with_label_values(&["s3", "GET"]) + .inc(); match resp { Ok(resp) => { - let body = resp.bytes().await.unwrap(); + let body: Bytes = resp.bytes().await.unwrap(); STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "GET", "200"]) .observe(elapsed); @@ -370,9 +371,10 @@ impl S3 { ) -> Result<(), ObjectStorageError> { let time = std::time::Instant::now(); let resp = self.client.put(&to_object_store_path(path), resource).await; - let elapsed = time.elapsed().as_secs_f64(); - + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "PUT"]) + .inc(); match resp { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -381,23 +383,7 @@ impl S3 { Ok(()) } Err(err) => { - let status_code = match &err { - object_store::Error::NotFound { .. } => { - // Check for specific S3 bucket not found error - let source_str = err.to_string(); - if source_str.contains("NoSuchBucket") { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "PUT", "404"]) - .observe(elapsed); - return Err(ObjectStorageError::Custom( - format!("Bucket '{}' does not exist in S3.", self.bucket) - .to_string(), - )); - } - "404" - } - _ => error_to_status_code(&err), - }; + let status_code = error_to_status_code(&err); STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "PUT", status_code]) .observe(elapsed); @@ -408,7 +394,7 @@ impl S3 { async fn _delete_prefix(&self, key: &str) -> Result<(), ObjectStorageError> { let files_scanned = Arc::new(AtomicU64::new(0)); - + let files_deleted = Arc::new(AtomicU64::new(0)); // Track LIST operation let list_start = Instant::now(); let object_stream = self.client.list(Some(&(key.into()))); @@ -417,23 +403,22 @@ impl S3 { .with_label_values(&["s3", "LIST", "200"]) .observe(list_elapsed); - let files_scanned_clone = files_scanned.clone(); object_stream .for_each_concurrent(None, |x| async { + files_scanned.fetch_add(1, Ordering::Relaxed); match x { Ok(obj) => { - files_scanned_clone.fetch_add(1, Ordering::Relaxed); - // Track individual DELETE operation + files_deleted.fetch_add(1, Ordering::Relaxed); let delete_start = Instant::now(); - match self.client.delete(&obj.location).await { + let delete_resp = self.client.delete(&obj.location).await; + let delete_elapsed = delete_start.elapsed().as_secs_f64(); + match delete_resp { Ok(_) => { - let delete_elapsed = delete_start.elapsed().as_secs_f64(); STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "DELETE", "200"]) .observe(delete_elapsed); } Err(err) => { - let delete_elapsed = delete_start.elapsed().as_secs_f64(); let status_code = error_to_status_code(&err); STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "DELETE", status_code]) @@ -449,11 +434,12 @@ impl S3 { }) .await; - // Record total files scanned STORAGE_FILES_SCANNED .with_label_values(&["s3", "LIST"]) .inc_by(files_scanned.load(Ordering::Relaxed) as f64); - + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "DELETE"]) + .inc_by(files_deleted.load(Ordering::Relaxed) as f64); Ok(()) } @@ -461,10 +447,10 @@ impl S3 { let mut result_file_list = HashSet::new(); let mut total_files_scanned = 0u64; - // Track initial LIST operation let list_start = Instant::now(); let resp = self.client.list_with_delimiter(None).await?; let list_elapsed = list_start.elapsed().as_secs_f64(); + total_files_scanned += resp.objects.len() as u64; STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "LIST", "200"]) .observe(list_elapsed); @@ -522,9 +508,8 @@ impl S3 { } async fn _list_dates(&self, stream: &str) -> Result, ObjectStorageError> { - // Track LIST operation let list_start = 
Instant::now(); - let resp = self + let resp: Result = self .client .list_with_delimiter(Some(&(stream.into()))) .await; @@ -548,7 +533,6 @@ impl S3 { let common_prefixes = resp.common_prefixes; - // Record files scanned (prefixes/directories count as files scanned) STORAGE_FILES_SCANNED .with_label_values(&["s3", "LIST"]) .inc_by(common_prefixes.len() as f64); @@ -580,6 +564,7 @@ impl S3 { let resp = match resp { Ok(resp) => { + total_files_scanned += resp.objects.len() as u64; STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "LIST", "200"]) .observe(list_elapsed); @@ -642,38 +627,30 @@ impl S3 { Ok(result_file_list) } - async fn _upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { - // // TODO: Uncomment this when multipart is fixed - // let should_multipart = std::fs::metadata(path)?.len() > MULTIPART_UPLOAD_SIZE as u64; - - let should_multipart = false; - - if should_multipart { - // self._upload_multipart(key, path).await - // this branch will never get executed - Ok(()) - } else { - let bytes = tokio::fs::read(path).await?; - let put_start = Instant::now(); - let result = self.client.put(&key.into(), bytes.into()).await; - let put_elapsed = put_start.elapsed().as_secs_f64(); + async fn _upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { + let bytes = tokio::fs::read(path).await?; - match result { - Ok(result) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "PUT", "200"]) - .observe(put_elapsed); - info!("Uploaded file to S3: {:?}", result); - Ok(()) - } - Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "PUT", status_code]) - .observe(put_elapsed); - Err(err.into()) - } + let put_start = Instant::now(); + let result = self.client.put(&key.into(), bytes.into()).await; + let put_elapsed = put_start.elapsed().as_secs_f64(); + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "PUT"]) + .inc(); + match result { + Ok(result) => { + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT", "200"]) + .observe(put_elapsed); + info!("Uploaded file to S3: {:?}", result); + Ok(()) + } + Err(err) => { + let status_code = error_to_status_code(&err); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "PUT", status_code]) + .observe(put_elapsed); + Err(err.into()) } } } @@ -694,14 +671,14 @@ impl S3 { let mut async_writer = match async_writer { Ok(writer) => { STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "PUT_MULTIPART_INIT", "200"]) + .with_label_values(&["s3", "PUT_MULTIPART", "200"]) .observe(multipart_elapsed); writer } Err(err) => { let status_code = error_to_status_code(&err); STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "PUT_MULTIPART_INIT", status_code]) + .with_label_values(&["s3", "PUT_MULTIPART", status_code]) .observe(multipart_elapsed); return Err(err.into()); } @@ -717,7 +694,9 @@ impl S3 { let put_start = Instant::now(); let result = self.client.put(location, data.into()).await; let put_elapsed = put_start.elapsed().as_secs_f64(); - + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "PUT"]) + .inc(); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -817,7 +796,9 @@ impl ObjectStorage for S3 { let head_start = Instant::now(); let meta = self.client.head(path).await; let head_elapsed = head_start.elapsed().as_secs_f64(); - + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "HEAD"]) + .inc(); let meta = match meta { Ok(meta) => { STORAGE_REQUEST_RESPONSE_TIME @@ -838,6 +819,7 
@@ impl ObjectStorage for S3 { let buf = object_store::buffered::BufReader::new(store, &meta); Ok(buf) } + async fn upload_multipart( &self, key: &RelativePath, @@ -845,11 +827,14 @@ impl ObjectStorage for S3 { ) -> Result<(), ObjectStorageError> { self._upload_multipart(key, path).await } + async fn head(&self, path: &RelativePath) -> Result { let head_start = Instant::now(); let result = self.client.head(&to_object_store_path(path)).await; let head_elapsed = head_start.elapsed().as_secs_f64(); - + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "HEAD"]) + .inc(); match &result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -872,14 +857,7 @@ impl ObjectStorage for S3 { } async fn get_object(&self, path: &RelativePath) -> Result { - let result = self._get_object(path).await?; - - // Record single file accessed - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "GET"]) - .inc(); - - Ok(result) + Ok(self._get_object(path).await?) } async fn get_objects( @@ -902,20 +880,9 @@ impl ObjectStorage for S3 { // Note: We track each streaming list item retrieval while let Some(meta_result) = list_stream.next().await { - let list_elapsed = list_start.elapsed().as_secs_f64(); - let meta = match meta_result { - Ok(meta) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "LIST", "200"]) - .observe(list_elapsed); - meta - } + Ok(meta) => meta, Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -933,13 +900,23 @@ impl ObjectStorage for S3 { .map_err(ObjectStorageError::PathError)?, ) .await?; + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "GET", "200"]) + .observe(list_start.elapsed().as_secs_f64()); + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "GET"]) + .inc(); res.push(byts); } + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(list_elapsed); // Record total files scanned STORAGE_FILES_SCANNED - .with_label_values(&["s3", "GET"]) + .with_label_values(&["s3", "LIST"]) .inc_by(files_scanned as f64); Ok(res) @@ -956,20 +933,9 @@ impl ObjectStorage for S3 { let mut object_stream = self.client.list(Some(&self.root)); while let Some(meta_result) = object_stream.next().await { - let list_elapsed = list_start.elapsed().as_secs_f64(); - let meta = match meta_result { - Ok(meta) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "LIST", "200"]) - .observe(list_elapsed); - meta - } + Ok(meta) => meta, Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -981,7 +947,10 @@ impl ObjectStorage for S3 { path_arr.push(RelativePathBuf::from(meta.location.as_ref())); } } - + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(list_elapsed); // Record total files scanned STORAGE_FILES_SCANNED .with_label_values(&["s3", "LIST"]) @@ -1003,20 +972,9 @@ impl ObjectStorage for S3 { let mut object_stream = self.client.list(Some(&path)); while let Some(meta_result) = object_stream.next().await { - let list_elapsed = list_start.elapsed().as_secs_f64(); - let meta = match meta_result { - Ok(meta) => { - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "LIST", "200"]) - 
.observe(list_elapsed); - meta - } + Ok(meta) => meta, Err(err) => { - let status_code = error_to_status_code(&err); - STORAGE_REQUEST_RESPONSE_TIME - .with_label_values(&["s3", "LIST", status_code]) - .observe(list_elapsed); return Err(err.into()); } }; @@ -1028,6 +986,10 @@ impl ObjectStorage for S3 { path_arr.push(RelativePathBuf::from(meta.location.as_ref())); } } + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(list_elapsed); path_arr.push(RelativePathBuf::from_iter([ stream_name, @@ -1052,11 +1014,6 @@ impl ObjectStorage for S3 { .await .map_err(|err| ObjectStorageError::ConnectionError(Box::new(err)))?; - // Record single file written - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "PUT"]) - .inc(); - Ok(()) } @@ -1113,6 +1070,9 @@ impl ObjectStorage for S3 { .observe(head_elapsed); } } + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "HEAD"]) + .inc(); Ok(result.map(|_| ())?) } @@ -1129,7 +1089,9 @@ impl ObjectStorage for S3 { let delete_start = Instant::now(); let result = self.client.delete(&to_object_store_path(&file)).await; let delete_elapsed = delete_start.elapsed().as_secs_f64(); - + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "DELETE"]) + .inc(); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -1143,15 +1105,7 @@ impl ObjectStorage for S3 { .with_label_values(&["s3", "DELETE", status_code]) .observe(delete_elapsed); - // if the object is not found, it is not an error - // the given url path was incorrect - if matches!(err, object_store::Error::NotFound { .. }) { - error!("Node does not exist"); - Err(err.into()) - } else { - error!("Error deleting node meta file: {:?}", err); - Err(err.into()) - } + Err(err.into()) } } } @@ -1224,7 +1178,12 @@ impl ObjectStorage for S3 { date: &str, ) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from(format!("{}/{}/", stream_name, date)); + let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&pre)).await?; + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(list_elapsed); let hours: Vec = resp .common_prefixes @@ -1253,7 +1212,12 @@ impl ObjectStorage for S3 { hour: &str, ) -> Result, ObjectStorageError> { let pre = object_store::path::Path::from(format!("{}/{}/{}/", stream_name, date, hour)); + let list_start = Instant::now(); let resp = self.client.list_with_delimiter(Some(&pre)).await?; + let list_elapsed = list_start.elapsed().as_secs_f64(); + STORAGE_REQUEST_RESPONSE_TIME + .with_label_values(&["s3", "LIST", "200"]) + .observe(list_elapsed); let minutes: Vec = resp .common_prefixes @@ -1286,9 +1250,7 @@ impl ObjectStorage for S3 { } async fn upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { - self._upload_file(key, path).await?; - - Ok(()) + Ok(self._upload_file(key, path).await?) 
     }
 
     fn absolute_url(&self, prefix: &RelativePath) -> object_store::path::Path {

From c88615f8d5c0559d844c168b60d9ea1508911cf9 Mon Sep 17 00:00:00 2001
From: Nikhil Sinha
Date: Mon, 25 Aug 2025 08:10:53 -0700
Subject: [PATCH 3/3] add scanned metrics by date

---
 src/metadata.rs                     |   4 +-
 src/metrics/mod.rs                  |  12 +--
 src/metrics/storage.rs              |  36 ++++++-
 src/query/stream_schema_provider.rs |  12 ++-
 src/stats.rs                        |  14 ++-
 src/storage/azure_blob.rs           | 150 +++++++++++++++++++---------
 src/storage/gcs.rs                  | 102 +++++++++++++++++--
 src/storage/localfs.rs              |  39 ++++++--
 src/storage/object_storage.rs       |   2 +-
 src/storage/s3.rs                   | 101 +++++++++++++++++--
 10 files changed, 384 insertions(+), 88 deletions(-)

diff --git a/src/metadata.rs b/src/metadata.rs
index 34b5880b4..2d3bcae22 100644
--- a/src/metadata.rs
+++ b/src/metadata.rs
@@ -62,10 +62,10 @@ pub fn update_stats(
         .add(size as i64);
     TOTAL_EVENTS_INGESTED_DATE
         .with_label_values(&[origin, &parsed_date])
-        .inc_by(num_rows as u64);
+        .add(num_rows as i64);
     TOTAL_EVENTS_INGESTED_SIZE_DATE
         .with_label_values(&[origin, &parsed_date])
-        .inc_by(size);
+        .add(size as i64);
 }
 
 /// In order to support backward compatability with streams created before v1.6.4,
diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs
index 81809eefd..27970c22d 100644
--- a/src/metrics/mod.rs
+++ b/src/metrics/mod.rs
@@ -160,8 +160,8 @@ pub static EVENTS_STORAGE_SIZE_DATE: Lazy = Lazy::new(|| {
     .expect("metric can be created")
 });
 
-pub static TOTAL_EVENTS_INGESTED_DATE: Lazy = Lazy::new(|| {
-    IntCounterVec::new(
+pub static TOTAL_EVENTS_INGESTED_DATE: Lazy = Lazy::new(|| {
+    IntGaugeVec::new(
         Opts::new(
             "total_events_ingested_date",
             "total events ingested on a particular date",
@@ -172,8 +172,8 @@ pub static TOTAL_EVENTS_INGESTED_DATE: Lazy = Lazy::new(|| {
-pub static TOTAL_EVENTS_INGESTED_SIZE_DATE: Lazy = Lazy::new(|| {
-    IntCounterVec::new(
+pub static TOTAL_EVENTS_INGESTED_SIZE_DATE: Lazy = Lazy::new(|| {
+    IntGaugeVec::new(
         Opts::new(
             "total_events_ingested_size_date",
             "Total events ingested size in bytes on a particular date",
@@ -184,8 +184,8 @@ pub static TOTAL_EVENTS_INGESTED_SIZE_DATE: Lazy = Lazy::new(|| {
-pub static TOTAL_EVENTS_STORAGE_SIZE_DATE: Lazy = Lazy::new(|| {
-    IntCounterVec::new(
+pub static TOTAL_EVENTS_STORAGE_SIZE_DATE: Lazy = Lazy::new(|| {
+    IntGaugeVec::new(
         Opts::new(
             "total_events_storage_size_date",
             "Total events storage size in bytes on a particular date",
diff --git a/src/metrics/storage.rs b/src/metrics/storage.rs
index d483d0ee8..3386c451a 100644
--- a/src/metrics/storage.rs
+++ b/src/metrics/storage.rs
@@ -44,12 +44,24 @@ pub static STORAGE_FILES_SCANNED: Lazy = Lazy::new(|| {
     .expect("metric can be created")
 });
 
+pub static STORAGE_FILES_SCANNED_DATE: Lazy = Lazy::new(|| {
+    CounterVec::new(
+        Opts::new(
+            "storage_files_scanned_date_total",
+            "Total number of files scanned in storage operations by date",
+        )
+        .namespace(METRICS_NAMESPACE),
+        &["provider", "operation", "date"],
+    )
+    .expect("metric can be created")
+});
+
 pub trait StorageMetrics {
     fn register_metrics(&self, handler: &PrometheusMetrics);
 }
 
 pub mod localfs {
-    use crate::storage::FSConfig;
+    use crate::{metrics::storage::STORAGE_FILES_SCANNED_DATE, storage::FSConfig};
 
     use super::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics};
 
@@ -63,12 +75,16 @@ pub mod localfs {
                 .registry
                 .register(Box::new(STORAGE_FILES_SCANNED.clone()))
                 .expect("metric can be registered");
+            handler
+                .registry
+                .register(Box::new(STORAGE_FILES_SCANNED_DATE.clone()))
+                .expect("metric can be registered");
         }
     }
 }
 
 pub mod s3 {
-    use crate::storage::S3Config;
+    use crate::{metrics::storage::STORAGE_FILES_SCANNED_DATE, storage::S3Config};
 
     use super::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics};
 
@@ -82,12 +98,16 @@ pub mod s3 {
                 .registry
                 .register(Box::new(STORAGE_FILES_SCANNED.clone()))
                 .expect("metric can be registered");
+            handler
+                .registry
+                .register(Box::new(STORAGE_FILES_SCANNED_DATE.clone()))
+                .expect("metric can be registered");
         }
     }
 }
 
 pub mod azureblob {
-    use crate::storage::AzureBlobConfig;
+    use crate::{metrics::storage::STORAGE_FILES_SCANNED_DATE, storage::AzureBlobConfig};
 
     use super::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics};
 
@@ -101,12 +121,16 @@ pub mod azureblob {
                 .registry
                 .register(Box::new(STORAGE_FILES_SCANNED.clone()))
                 .expect("metric can be registered");
+            handler
+                .registry
+                .register(Box::new(STORAGE_FILES_SCANNED_DATE.clone()))
+                .expect("metric can be registered");
         }
     }
 }
 
 pub mod gcs {
-    use crate::storage::GcsConfig;
+    use crate::{metrics::storage::STORAGE_FILES_SCANNED_DATE, storage::GcsConfig};
 
     use super::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics};
 
@@ -120,6 +144,10 @@ pub mod gcs {
                 .registry
                 .register(Box::new(STORAGE_FILES_SCANNED.clone()))
                 .expect("metric can be registered");
+            handler
+                .registry
+                .register(Box::new(STORAGE_FILES_SCANNED_DATE.clone()))
+                .expect("metric can be registered");
         }
     }
 }
diff --git a/src/query/stream_schema_provider.rs b/src/query/stream_schema_provider.rs
index efe0fad7f..ed4b64cd7 100644
--- a/src/query/stream_schema_provider.rs
+++ b/src/query/stream_schema_provider.rs
@@ -57,7 +57,10 @@ use crate::{
     },
     event::DEFAULT_TIMESTAMP_KEY,
     hottier::HotTierManager,
-    metrics::{QUERY_CACHE_HIT, storage::STORAGE_FILES_SCANNED},
+    metrics::{
+        QUERY_CACHE_HIT,
+        storage::{STORAGE_FILES_SCANNED, STORAGE_FILES_SCANNED_DATE},
+    },
     option::Mode,
     parseable::{PARSEABLE, STREAM_EXISTS},
     storage::{ObjectStorage, ObjectStorageError, ObjectStoreFormat, STREAM_ROOT_DIRECTORY},
@@ -570,6 +573,13 @@ impl TableProvider for StandardTableProvider {
         STORAGE_FILES_SCANNED
             .with_label_values(&[PARSEABLE.storage().name(), "GET"])
             .inc_by(parquet_files_to_scan as f64);
+        STORAGE_FILES_SCANNED_DATE
+            .with_label_values(&[
+                PARSEABLE.storage().name(),
+                "GET",
+                &Utc::now().date_naive().to_string(),
+            ])
+            .inc_by(parquet_files_to_scan as f64);
 
         let (partitioned_files, statistics) = self.partitioned_files(manifest_files);
         self.create_parquet_physical_plan(
diff --git a/src/stats.rs b/src/stats.rs
index 0c2214043..464a22b0a 100644
--- a/src/stats.rs
+++ b/src/stats.rs
@@ -137,13 +137,19 @@ pub async fn update_deleted_stats(
                 "parquet",
                 &manifest_date,
             ]);
-            let _ = TOTAL_EVENTS_INGESTED_DATE.remove_label_values(&["json", &manifest_date]);
-            let _ = TOTAL_EVENTS_INGESTED_SIZE_DATE.remove_label_values(&["json", &manifest_date]);
-            let _ =
-                TOTAL_EVENTS_STORAGE_SIZE_DATE.remove_label_values(&["parquet", &manifest_date]);
+
             num_row += manifest.events_ingested as i64;
             ingestion_size += manifest.ingestion_size as i64;
             storage_size += manifest.storage_size as i64;
+            TOTAL_EVENTS_INGESTED_DATE
+                .with_label_values(&["json", &manifest_date])
+                .sub(manifest.events_ingested as i64);
+            TOTAL_EVENTS_INGESTED_SIZE_DATE
+                .with_label_values(&["json", &manifest_date])
+                .sub(manifest.ingestion_size as i64);
+            TOTAL_EVENTS_STORAGE_SIZE_DATE
+                .with_label_values(&["parquet", &manifest_date])
+
.sub(manifest.storage_size as i64); } } EVENTS_DELETED diff --git a/src/storage/azure_blob.rs b/src/storage/azure_blob.rs index 2956dfe91..bb35c0628 100644 --- a/src/storage/azure_blob.rs +++ b/src/storage/azure_blob.rs @@ -28,6 +28,7 @@ use std::{ use async_trait::async_trait; use bytes::Bytes; +use chrono::Utc; use datafusion::{ datasource::listing::ListingTableUrl, execution::{ @@ -50,7 +51,10 @@ use url::Url; use crate::{ handlers::http::users::USERS_ROOT_DIR, - metrics::storage::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}, + metrics::storage::{ + STORAGE_FILES_SCANNED, STORAGE_FILES_SCANNED_DATE, STORAGE_REQUEST_RESPONSE_TIME, + StorageMetrics, + }, parseable::LogStream, }; @@ -219,6 +223,9 @@ impl BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "GET"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "GET", &Utc::now().date_naive().to_string()]) + .inc(); match resp { Ok(resp) => { let body: Bytes = resp.bytes().await.unwrap(); @@ -248,6 +255,9 @@ impl BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "PUT"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "PUT", &Utc::now().date_naive().to_string()]) + .inc(); match resp { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -310,9 +320,15 @@ impl BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "LIST"]) .inc_by(files_scanned.load(Ordering::Relaxed) as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned.load(Ordering::Relaxed) as f64); STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "DELETE"]) .inc_by(files_deleted.load(Ordering::Relaxed) as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "DELETE", &Utc::now().date_naive().to_string()]) + .inc_by(files_deleted.load(Ordering::Relaxed) as f64); Ok(()) } @@ -376,6 +392,9 @@ impl BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "LIST"]) .inc_by(total_files_scanned as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(total_files_scanned as f64); Ok(result_file_list) } @@ -409,6 +428,9 @@ impl BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "LIST"]) .inc_by(common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(common_prefixes.len() as f64); // return prefixes at the root level let dates: Vec<_> = common_prefixes @@ -497,6 +519,9 @@ impl BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "LIST"]) .inc_by(total_files_scanned as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(total_files_scanned as f64); Ok(result_file_list) } @@ -510,6 +535,9 @@ impl BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "PUT"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "PUT", &Utc::now().date_naive().to_string()]) + .inc(); match result { Ok(result) => { STORAGE_REQUEST_RESPONSE_TIME @@ -653,49 +681,6 @@ impl BlobStore { } Ok(()) } - - // TODO: introduce parallel, multipart-uploads if required - // async fn _upload_multipart(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError> { - // let mut buf = vec![0u8; MULTIPART_UPLOAD_SIZE / 2]; - // let mut file = 
OpenOptions::new().read(true).open(path).await?; - - // // let (multipart_id, mut async_writer) = self.client.put_multipart(&key.into()).await?; - // let mut async_writer = self.client.put_multipart(&key.into()).await?; - - // /* `abort_multipart()` has been removed */ - // // let close_multipart = |err| async move { - // // error!("multipart upload failed. {:?}", err); - // // self.client - // // .abort_multipart(&key.into(), &multipart_id) - // // .await - // // }; - - // loop { - // match file.read(&mut buf).await { - // Ok(len) => { - // if len == 0 { - // break; - // } - // if let Err(err) = async_writer.write_all(&buf[0..len]).await { - // // close_multipart(err).await?; - // break; - // } - // if let Err(err) = async_writer.flush().await { - // // close_multipart(err).await?; - // break; - // } - // } - // Err(err) => { - // // close_multipart(err).await?; - // break; - // } - // } - // } - - // async_writer.shutdown().await?; - - // Ok(()) - // } } #[async_trait] @@ -733,6 +718,13 @@ impl ObjectStorage for BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "HEAD"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&[ + "azure_blob", + "HEAD", + &Utc::now().date_naive().to_string(), + ]) + .inc(); } Err(err) => { let status_code = error_to_status_code(err); @@ -795,6 +787,9 @@ impl ObjectStorage for BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "GET"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "GET", &Utc::now().date_naive().to_string()]) + .inc(); res.push(byts); } @@ -807,6 +802,9 @@ impl ObjectStorage for BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "LIST"]) .inc_by(files_scanned as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned as f64); Ok(res) } @@ -844,6 +842,9 @@ impl ObjectStorage for BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "LIST"]) .inc_by(files_scanned as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned as f64); Ok(path_arr) } @@ -890,6 +891,9 @@ impl ObjectStorage for BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "LIST"]) .inc_by(files_scanned as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned as f64); Ok(path_arr) } @@ -926,6 +930,13 @@ impl ObjectStorage for BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "DELETE"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&[ + "azure_blob", + "DELETE", + &Utc::now().date_naive().to_string(), + ]) + .inc(); } Err(err) => { let status_code = error_to_status_code(err); @@ -962,6 +973,9 @@ impl ObjectStorage for BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "HEAD"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "HEAD", &Utc::now().date_naive().to_string()]) + .inc(); Ok(result.map(|_| ())?) 
} @@ -981,6 +995,9 @@ impl ObjectStorage for BlobStore { STORAGE_FILES_SCANNED .with_label_values(&["azure_blob", "DELETE"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "DELETE", &Utc::now().date_naive().to_string()]) + .inc(); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -1013,7 +1030,12 @@ impl ObjectStorage for BlobStore { .observe(list_elapsed); let common_prefixes = resp.common_prefixes; // get all dirs - + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "LIST"]) + .inc_by(common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(common_prefixes.len() as f64); // return prefixes at the root level let dirs: HashSet<_> = common_prefixes .iter() @@ -1049,6 +1071,12 @@ impl ObjectStorage for BlobStore { }; stream_json_check.push(task); } + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "HEAD"]) + .inc_by(dirs.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "HEAD", &Utc::now().date_naive().to_string()]) + .inc_by(dirs.len() as f64); stream_json_check.try_collect::<()>().await?; @@ -1073,6 +1101,12 @@ impl ObjectStorage for BlobStore { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "LIST", "200"]) .observe(list_elapsed); + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "LIST"]) + .inc_by(resp.common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(resp.common_prefixes.len() as f64); let hours: Vec = resp .common_prefixes @@ -1107,6 +1141,12 @@ impl ObjectStorage for BlobStore { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "LIST", "200"]) .observe(list_elapsed); + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "LIST"]) + .inc_by(resp.common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["azure_blob", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(resp.common_prefixes.len() as f64); let minutes: Vec = resp .common_prefixes @@ -1176,6 +1216,16 @@ impl ObjectStorage for BlobStore { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "LIST", "200"]) .observe(list_elapsed); + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "LIST"]) + .inc_by(resp.common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&[ + "azure_blob", + "LIST", + &Utc::now().date_naive().to_string(), + ]) + .inc_by(resp.common_prefixes.len() as f64); resp } Err(err) => { @@ -1210,6 +1260,16 @@ impl ObjectStorage for BlobStore { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["azure_blob", "LIST", "200"]) .observe(list_elapsed); + STORAGE_FILES_SCANNED + .with_label_values(&["azure_blob", "LIST"]) + .inc_by(resp.common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&[ + "azure_blob", + "LIST", + &Utc::now().date_naive().to_string(), + ]) + .inc_by(resp.common_prefixes.len() as f64); resp } Err(err) => { diff --git a/src/storage/gcs.rs b/src/storage/gcs.rs index 9af321533..dbce373b0 100644 --- a/src/storage/gcs.rs +++ b/src/storage/gcs.rs @@ -28,11 +28,15 @@ use std::{ use crate::{ handlers::http::users::USERS_ROOT_DIR, - metrics::storage::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}, + metrics::storage::{ + STORAGE_FILES_SCANNED, STORAGE_FILES_SCANNED_DATE, STORAGE_REQUEST_RESPONSE_TIME, + StorageMetrics, + }, parseable::LogStream, }; use 
async_trait::async_trait; use bytes::Bytes; +use chrono::Utc; use datafusion::{ datasource::listing::ListingTableUrl, execution::{ @@ -184,6 +188,9 @@ impl Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "GET"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "GET", &Utc::now().date_naive().to_string()]) + .inc(); match resp { Ok(resp) => { let body: Bytes = resp.bytes().await.unwrap(); @@ -213,6 +220,9 @@ impl Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "PUT"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "PUT", &Utc::now().date_naive().to_string()]) + .inc(); match resp { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -275,9 +285,15 @@ impl Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "LIST"]) .inc_by(files_scanned.load(Ordering::Relaxed) as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned.load(Ordering::Relaxed) as f64); STORAGE_FILES_SCANNED .with_label_values(&["gcs", "DELETE"]) .inc_by(files_deleted.load(Ordering::Relaxed) as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "DELETE", &Utc::now().date_naive().to_string()]) + .inc_by(files_deleted.load(Ordering::Relaxed) as f64); Ok(()) } @@ -341,6 +357,9 @@ impl Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "LIST"]) .inc_by(total_files_scanned as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(total_files_scanned as f64); Ok(result_file_list) } @@ -374,6 +393,9 @@ impl Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "LIST"]) .inc_by(common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(common_prefixes.len() as f64); // return prefixes at the root level let dates: Vec<_> = common_prefixes @@ -462,6 +484,9 @@ impl Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "LIST"]) .inc_by(total_files_scanned as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(total_files_scanned as f64); Ok(result_file_list) } @@ -475,6 +500,9 @@ impl Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "PUT"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "PUT", &Utc::now().date_naive().to_string()]) + .inc(); match result { Ok(result) => { STORAGE_REQUEST_RESPONSE_TIME @@ -632,6 +660,9 @@ impl ObjectStorage for Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "HEAD"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "HEAD", &Utc::now().date_naive().to_string()]) + .inc(); let meta = match meta { Ok(meta) => { STORAGE_REQUEST_RESPONSE_TIME @@ -668,15 +699,14 @@ impl ObjectStorage for Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "HEAD"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "HEAD", &Utc::now().date_naive().to_string()]) + .inc(); match &result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "HEAD", "200"]) .observe(head_elapsed); - // Record single file accessed - STORAGE_FILES_SCANNED - .with_label_values(&["gcs", "HEAD"]) - .inc(); } Err(err) => { let status_code = error_to_status_code(err); @@ -739,6 +769,9 @@ impl ObjectStorage for Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "GET"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "GET", &Utc::now().date_naive().to_string()]) + .inc(); 
res.push(byts); } @@ -751,6 +784,9 @@ impl ObjectStorage for Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "LIST"]) .inc_by(files_scanned as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned as f64); Ok(res) } @@ -788,7 +824,9 @@ impl ObjectStorage for Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "LIST"]) .inc_by(files_scanned as f64); - + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned as f64); Ok(path_arr) } @@ -834,7 +872,9 @@ impl ObjectStorage for Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "LIST"]) .inc_by(files_scanned as f64); - + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned as f64); Ok(path_arr) } @@ -870,6 +910,9 @@ impl ObjectStorage for Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "DELETE"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "DELETE", &Utc::now().date_naive().to_string()]) + .inc(); } Err(err) => { let status_code = error_to_status_code(err); @@ -906,6 +949,9 @@ impl ObjectStorage for Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "HEAD"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "HEAD", &Utc::now().date_naive().to_string()]) + .inc(); Ok(result.map(|_| ())?) } @@ -925,6 +971,9 @@ impl ObjectStorage for Gcs { STORAGE_FILES_SCANNED .with_label_values(&["gcs", "DELETE"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "DELETE", &Utc::now().date_naive().to_string()]) + .inc(); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -957,7 +1006,12 @@ impl ObjectStorage for Gcs { .observe(list_elapsed); let common_prefixes = resp.common_prefixes; // get all dirs - + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "LIST"]) + .inc_by(common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(common_prefixes.len() as f64); // return prefixes at the root level let dirs: HashSet<_> = common_prefixes .iter() @@ -993,6 +1047,12 @@ impl ObjectStorage for Gcs { }; stream_json_check.push(task); } + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "HEAD"]) + .inc_by(dirs.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "HEAD", &Utc::now().date_naive().to_string()]) + .inc_by(dirs.len() as f64); stream_json_check.try_collect::<()>().await?; @@ -1017,7 +1077,12 @@ impl ObjectStorage for Gcs { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "LIST", "200"]) .observe(list_elapsed); - + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "LIST"]) + .inc_by(resp.common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(resp.common_prefixes.len() as f64); let hours: Vec = resp .common_prefixes .iter() @@ -1051,7 +1116,12 @@ impl ObjectStorage for Gcs { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "LIST", "200"]) .observe(list_elapsed); - + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "LIST"]) + .inc_by(resp.common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(resp.common_prefixes.len() as f64); let minutes: Vec = resp .common_prefixes .iter() @@ -1116,6 +1186,12 @@ impl ObjectStorage for Gcs { 
STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "LIST", "200"]) .observe(list_elapsed); + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "LIST"]) + .inc_by(resp.common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(resp.common_prefixes.len() as f64); resp } Err(err) => { @@ -1150,6 +1226,12 @@ impl ObjectStorage for Gcs { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["gcs", "LIST", "200"]) .observe(list_elapsed); + STORAGE_FILES_SCANNED + .with_label_values(&["gcs", "LIST"]) + .inc_by(resp.common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["gcs", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(resp.common_prefixes.len() as f64); resp } Err(err) => { diff --git a/src/storage/localfs.rs b/src/storage/localfs.rs index 2822b6b68..985ece9a3 100644 --- a/src/storage/localfs.rs +++ b/src/storage/localfs.rs @@ -25,6 +25,7 @@ use std::{ use async_trait::async_trait; use bytes::Bytes; +use chrono::Utc; use datafusion::{datasource::listing::ListingTableUrl, execution::runtime_env::RuntimeEnvBuilder}; use fs_extra::file::CopyOptions; use futures::{TryStreamExt, stream::FuturesUnordered}; @@ -38,7 +39,10 @@ use tokio_stream::wrappers::ReadDirStream; use crate::{ handlers::http::users::USERS_ROOT_DIR, - metrics::storage::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}, + metrics::storage::{ + STORAGE_FILES_SCANNED, STORAGE_FILES_SCANNED_DATE, STORAGE_REQUEST_RESPONSE_TIME, + StorageMetrics, + }, option::validation, parseable::LogStream, storage::SETTINGS_ROOT_DIRECTORY, @@ -134,7 +138,9 @@ impl ObjectStorage for LocalFS { STORAGE_FILES_SCANNED .with_label_values(&["localfs", "HEAD"]) .inc(); - + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["localfs", "HEAD", &Utc::now().date_naive().to_string()]) + .inc(); Err(ObjectStorageError::UnhandledError(Box::new( std::io::Error::new( std::io::ErrorKind::Unsupported, @@ -158,22 +164,24 @@ impl ObjectStorage for LocalFS { STORAGE_FILES_SCANNED .with_label_values(&["localfs", "GET"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["localfs", "GET", &Utc::now().date_naive().to_string()]) + .inc(); Ok(x.into()) } - Err(e) => match e.kind() { - std::io::ErrorKind::NotFound => { + Err(e) => { + if e.kind() == std::io::ErrorKind::NotFound { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["localfs", "GET", "404"]) .observe(get_elapsed); Err(ObjectStorageError::NoSuchKey(path.to_string())) - } - _ => { + } else { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["localfs", "GET", "500"]) .observe(get_elapsed); Err(ObjectStorageError::UnhandledError(Box::new(e))) } - }, + } }; res @@ -227,7 +235,9 @@ impl ObjectStorage for LocalFS { STORAGE_FILES_SCANNED .with_label_values(&["localfs", "LIST"]) .inc_by(files_scanned as f64); - + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["localfs", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned as f64); Ok(path_arr) } @@ -291,7 +301,9 @@ impl ObjectStorage for LocalFS { STORAGE_FILES_SCANNED .with_label_values(&["localfs", "LIST"]) .inc_by(files_scanned as f64); - + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["localfs", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned as f64); Ok(path_arr) } @@ -370,6 +382,9 @@ impl ObjectStorage for LocalFS { STORAGE_FILES_SCANNED .with_label_values(&["localfs", "GET"]) .inc_by(files_scanned as f64); + STORAGE_FILES_SCANNED_DATE + 
.with_label_values(&["localfs", "GET", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned as f64); // maybe change the return code @@ -399,6 +414,9 @@ impl ObjectStorage for LocalFS { STORAGE_FILES_SCANNED .with_label_values(&["localfs", "PUT"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["localfs", "PUT", &Utc::now().date_naive().to_string()]) + .inc(); } Err(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -455,6 +473,9 @@ impl ObjectStorage for LocalFS { STORAGE_FILES_SCANNED .with_label_values(&["localfs", "DELETE"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["localfs", "DELETE", &Utc::now().date_naive().to_string()]) + .inc(); } Err(err) => { let status_code = match err.kind() { diff --git a/src/storage/object_storage.rs b/src/storage/object_storage.rs index ba4675b0e..fa006becf 100644 --- a/src/storage/object_storage.rs +++ b/src/storage/object_storage.rs @@ -162,7 +162,7 @@ fn update_storage_metrics( .add(compressed_size as i64); TOTAL_EVENTS_STORAGE_SIZE_DATE .with_label_values(&["parquet", file_date_part]) - .inc_by(compressed_size); + .add(compressed_size as i64); Ok(()) } diff --git a/src/storage/s3.rs b/src/storage/s3.rs index 59a0efe0e..d594f15f4 100644 --- a/src/storage/s3.rs +++ b/src/storage/s3.rs @@ -30,6 +30,7 @@ use std::{ use async_trait::async_trait; use bytes::Bytes; +use chrono::Utc; use datafusion::{ datasource::listing::ListingTableUrl, execution::{ @@ -51,7 +52,10 @@ use tracing::{error, info}; use crate::{ handlers::http::users::USERS_ROOT_DIR, - metrics::storage::{STORAGE_FILES_SCANNED, STORAGE_REQUEST_RESPONSE_TIME, StorageMetrics}, + metrics::storage::{ + STORAGE_FILES_SCANNED, STORAGE_FILES_SCANNED_DATE, STORAGE_REQUEST_RESPONSE_TIME, + StorageMetrics, + }, parseable::LogStream, }; @@ -346,6 +350,9 @@ impl S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "GET"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "GET", &Utc::now().date_naive().to_string()]) + .inc(); match resp { Ok(resp) => { let body: Bytes = resp.bytes().await.unwrap(); @@ -375,6 +382,9 @@ impl S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "PUT"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "PUT", &Utc::now().date_naive().to_string()]) + .inc(); match resp { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -437,9 +447,15 @@ impl S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "LIST"]) .inc_by(files_scanned.load(Ordering::Relaxed) as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned.load(Ordering::Relaxed) as f64); STORAGE_FILES_SCANNED .with_label_values(&["s3", "DELETE"]) .inc_by(files_deleted.load(Ordering::Relaxed) as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "DELETE", &Utc::now().date_naive().to_string()]) + .inc_by(files_deleted.load(Ordering::Relaxed) as f64); Ok(()) } @@ -503,6 +519,9 @@ impl S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "LIST"]) .inc_by(total_files_scanned as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(total_files_scanned as f64); Ok(result_file_list) } @@ -536,6 +555,9 @@ impl S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "LIST"]) .inc_by(common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(common_prefixes.len() as f64); // return prefixes at the root level let dates: Vec<_> = 
common_prefixes @@ -624,6 +646,9 @@ impl S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "LIST"]) .inc_by(total_files_scanned as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(total_files_scanned as f64); Ok(result_file_list) } @@ -637,6 +662,9 @@ impl S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "PUT"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "PUT", &Utc::now().date_naive().to_string()]) + .inc(); match result { Ok(result) => { STORAGE_REQUEST_RESPONSE_TIME @@ -697,6 +725,9 @@ impl S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "PUT"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "PUT", &Utc::now().date_naive().to_string()]) + .inc(); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -799,6 +830,9 @@ impl ObjectStorage for S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "HEAD"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "HEAD", &Utc::now().date_naive().to_string()]) + .inc(); let meta = match meta { Ok(meta) => { STORAGE_REQUEST_RESPONSE_TIME @@ -835,15 +869,14 @@ impl ObjectStorage for S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "HEAD"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "HEAD", &Utc::now().date_naive().to_string()]) + .inc(); match &result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "HEAD", "200"]) .observe(head_elapsed); - // Record single file accessed - STORAGE_FILES_SCANNED - .with_label_values(&["s3", "HEAD"]) - .inc(); } Err(err) => { let status_code = error_to_status_code(err); @@ -906,6 +939,9 @@ impl ObjectStorage for S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "GET"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "GET", &Utc::now().date_naive().to_string()]) + .inc(); res.push(byts); } @@ -918,6 +954,9 @@ impl ObjectStorage for S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "LIST"]) .inc_by(files_scanned as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned as f64); Ok(res) } @@ -955,6 +994,9 @@ impl ObjectStorage for S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "LIST"]) .inc_by(files_scanned as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned as f64); Ok(path_arr) } @@ -1001,6 +1043,9 @@ impl ObjectStorage for S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "LIST"]) .inc_by(files_scanned as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(files_scanned as f64); Ok(path_arr) } @@ -1037,6 +1082,9 @@ impl ObjectStorage for S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "DELETE"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "DELETE", &Utc::now().date_naive().to_string()]) + .inc(); } Err(err) => { let status_code = error_to_status_code(err); @@ -1073,6 +1121,9 @@ impl ObjectStorage for S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "HEAD"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "HEAD", &Utc::now().date_naive().to_string()]) + .inc(); Ok(result.map(|_| ())?) 
} @@ -1092,6 +1143,9 @@ impl ObjectStorage for S3 { STORAGE_FILES_SCANNED .with_label_values(&["s3", "DELETE"]) .inc(); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "DELETE", &Utc::now().date_naive().to_string()]) + .inc(); match result { Ok(_) => { STORAGE_REQUEST_RESPONSE_TIME @@ -1124,7 +1178,12 @@ impl ObjectStorage for S3 { .observe(list_elapsed); let common_prefixes = resp.common_prefixes; // get all dirs - + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "LIST"]) + .inc_by(common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(common_prefixes.len() as f64); // return prefixes at the root level let dirs: HashSet<_> = common_prefixes .iter() @@ -1160,6 +1219,12 @@ impl ObjectStorage for S3 { }; stream_json_check.push(task); } + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "HEAD"]) + .inc_by(dirs.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "HEAD", &Utc::now().date_naive().to_string()]) + .inc_by(dirs.len() as f64); stream_json_check.try_collect::<()>().await?; @@ -1184,6 +1249,12 @@ impl ObjectStorage for S3 { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "LIST", "200"]) .observe(list_elapsed); + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "LIST"]) + .inc_by(resp.common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(resp.common_prefixes.len() as f64); let hours: Vec = resp .common_prefixes @@ -1218,6 +1289,12 @@ impl ObjectStorage for S3 { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "LIST", "200"]) .observe(list_elapsed); + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "LIST"]) + .inc_by(resp.common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(resp.common_prefixes.len() as f64); let minutes: Vec = resp .common_prefixes @@ -1283,6 +1360,12 @@ impl ObjectStorage for S3 { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "LIST", "200"]) .observe(list_elapsed); + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "LIST"]) + .inc_by(resp.common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(resp.common_prefixes.len() as f64); resp } Err(err) => { @@ -1317,6 +1400,12 @@ impl ObjectStorage for S3 { STORAGE_REQUEST_RESPONSE_TIME .with_label_values(&["s3", "LIST", "200"]) .observe(list_elapsed); + STORAGE_FILES_SCANNED + .with_label_values(&["s3", "LIST"]) + .inc_by(resp.common_prefixes.len() as f64); + STORAGE_FILES_SCANNED_DATE + .with_label_values(&["s3", "LIST", &Utc::now().date_naive().to_string()]) + .inc_by(resp.common_prefixes.len() as f64); resp } Err(err) => {