@@ -280,6 +280,132 @@ def number_of_chunks(self):
280
280
return self .number_of_chunks
281
281
282
282
283
class PolarsDataframeSerializer:
    """Serialize a Polars DataFrame into LineProtocols."""

    def __init__(self, data_frame, point_settings, precision=DEFAULT_WRITE_PRECISION, chunk_size: int = None,
                 **kwargs) -> None:
        """
        Init serializer.

        :param data_frame: Polars DataFrame to serialize
        :param point_settings: Default Tags
        :param precision: The precision for the unix timestamps within the body line-protocol.
        :param chunk_size: The size of chunk for serializing into chunks.
        :key data_frame_measurement_name: name of measurement for writing Polars DataFrame
        :key data_frame_tag_columns: list of DataFrame columns which are tags, rest columns will be fields
        :key data_frame_timestamp_column: name of DataFrame column which contains a timestamp.
        :key data_frame_timestamp_timezone: name of the timezone which is used for timestamp column
        """
        self.data_frame = data_frame
        self.point_settings = point_settings
        self.precision = precision
        self.chunk_size = chunk_size
        self.measurement_name = kwargs.get("data_frame_measurement_name", "measurement")
        self.tag_columns = kwargs.get("data_frame_tag_columns", [])
        self.timestamp_column = kwargs.get("data_frame_timestamp_column", None)
        # Stored for callers; nothing in this class reads it.
        self.timestamp_timezone = kwargs.get("data_frame_timestamp_timezone", None)

        # Rows are handed to ``to_line_protocol`` as positional sequences, so
        # remember where each column lives.
        self.column_indices = {name: index for index, name in enumerate(data_frame.columns)}

        #
        # prepare chunks
        #
        if chunk_size is not None:
            self.number_of_chunks = int(math.ceil(len(data_frame) / float(chunk_size)))
        else:
            self.number_of_chunks = None

    def escape_value(self, value):
        """Return ``value`` as a string escaped for use as a tag key, tag value or field key."""
        return str(value).translate(_ESCAPE_KEY)

    @staticmethod
    def _format_field_value(value):
        """Render a single field value using the line-protocol literal for its Python type."""
        # ``bool`` is a subclass of ``int``; it has to be tested first, otherwise
        # ``True`` would be rendered as the invalid integer literal ``Truei``.
        if isinstance(value, bool):
            return str(value).lower()
        if isinstance(value, str):
            # Backslashes and double quotes must be escaped inside a quoted string.
            escaped = value.replace("\\", "\\\\").replace('"', '\\"')
            return f'"{escaped}"'
        if isinstance(value, int):
            return f"{value}i"
        return f"{value}"

    def to_line_protocol(self, row):
        """
        Serialize a single row into one LineProtocol line.

        :param row: sequence of row values, indexed by column position
        :return: ``measurement[,tags] fields timestamp`` as a string
        """
        indices = self.column_indices

        # Tags: skip None and empty values, then append the default tags.
        tag_parts = []
        for col in self.tag_columns:
            value = row[indices[col]]
            if value is not None and value != "":
                tag_parts.append(f"{self.escape_value(col)}={self.escape_value(value)}")

        default_tags = self.point_settings.defaultTags
        if default_tags:
            tag_parts.extend(
                f"{self.escape_value(key)}={self.escape_value(value)}"
                for key, value in default_tags.items()
            )
        tags = ",".join(tag_parts)

        # Fields: every column which is neither a tag nor the timestamp.
        # The exclusion set is built once per row instead of once per column.
        excluded = set(self.tag_columns)
        excluded.add(self.timestamp_column)

        field_parts = []
        for col, index in indices.items():
            if col in excluded:
                continue
            value = row[index]
            if value is not None and value != "":
                field_parts.append(f"{self.escape_value(col)}={self._format_field_value(value)}")
        fields = ",".join(field_parts)

        # ``serialize`` has already converted this column to a unix timestamp.
        timestamp = row[indices[self.timestamp_column]]

        prefix = f"{self.measurement_name},{tags}" if tags != "" else f"{self.measurement_name}"
        return f"{prefix} {fields} {timestamp}"

    def serialize(self, chunk_idx: int = None):
        """
        Serialize the DataFrame, or one chunk of it, into a list of LineProtocol lines.

        :param chunk_idx: zero-based index of the chunk to serialize; ``None`` serializes everything
        :return: list with one LineProtocol string per row
        :raises ValueError: if no timestamp column was configured or the precision is unsupported
        """
        from ...extras import pl

        if self.timestamp_column is None:
            raise ValueError("data_frame_timestamp_column is required to serialize a Polars DataFrame")

        # ``None`` falls back to nanoseconds.
        precision = "ns" if self.precision is None else self.precision
        if precision not in ("ns", "us", "ms", "s"):
            raise ValueError(f"Unsupported precision: {self.precision}")

        # Convert timestamp to unix timestamp
        df = self.data_frame.with_columns(
            pl.col(self.timestamp_column).dt.epoch(time_unit=str(precision)).alias(self.timestamp_column)
        )

        if chunk_idx is None:
            chunk = df
        else:
            logger.debug("Serialize chunk %s/%s ...", chunk_idx + 1, self.number_of_chunks)
            chunk = df[chunk_idx * self.chunk_size:(chunk_idx + 1) * self.chunk_size]

        # Apply the UDF to each row. ``DataFrame.apply`` is the old name of
        # ``map_rows``; prefer the new name when the installed Polars has it.
        map_rows = getattr(chunk, "map_rows", None)
        if map_rows is None:
            map_rows = chunk.apply
        mapped = map_rows(self.to_line_protocol, return_dtype=pl.Object)

        # The UDF returns a scalar, so the result has a single column; read it
        # by position rather than relying on the name Polars gives it.
        return mapped[mapped.columns[0]].to_list()
403
+
404
+
405
+
406
+
407
+
408
+
283
409
def data_frame_to_list_of_points (data_frame , point_settings , precision = DEFAULT_WRITE_PRECISION , ** kwargs ):
284
410
"""
285
411
Serialize DataFrame into LineProtocols.
@@ -295,3 +421,19 @@ def data_frame_to_list_of_points(data_frame, point_settings, precision=DEFAULT_W
295
421
:key data_frame_timestamp_timezone: name of the timezone which is used for timestamp column - ``DataFrame``
296
422
""" # noqa: E501
297
423
return DataframeSerializer (data_frame , point_settings , precision , ** kwargs ).serialize ()
424
+
425
def polars_data_frame_to_list_of_points(data_frame, point_settings, precision=DEFAULT_WRITE_PRECISION, **kwargs):
    """
    Serialize Polars DataFrame into LineProtocols.

    Builds a :class:`PolarsDataframeSerializer` from the arguments and serializes the whole DataFrame at once.

    :param data_frame: Polars DataFrame to serialize
    :param point_settings: Default Tags
    :param precision: The precision for the unix timestamps within the body line-protocol.
    :key data_frame_measurement_name: name of measurement for writing Polars DataFrame
    :key data_frame_tag_columns: list of DataFrame columns which are tags, rest columns will be fields
    :key data_frame_timestamp_column: name of DataFrame column which contains a timestamp
    :key data_frame_timestamp_timezone: name of the timezone which is used for timestamp column
    :return: the result of ``PolarsDataframeSerializer.serialize()`` called without a chunk index
    """  # noqa: E501
    serializer = PolarsDataframeSerializer(data_frame, point_settings, precision, **kwargs)
    return serializer.serialize()
0 commit comments