1
1
import os
2
2
import re
3
+ from typing import Any , Callable
3
4
4
5
import boto3
5
6
import pandas as pd
8
9
9
10
10
11
class S3DataSource (IDataSource ):
11
- default_bucket = os .environ .get ("S3_BUCKET_NAME" , "caltrans-pems-prd-us-west-2-marts" )
12
+ """A data source for fetching data from an S3 bucket."""
13
+
14
+ @property
15
+ def default_bucket (self ) -> str :
16
+ """
17
+ Returns:
18
+ value (str): The value of the `S3_BUCKET_NAME` environment variable, or the Caltrans PeMS prod mart bucket name if the variable is not set.
19
+ """
20
+ return os .environ .get ("S3_BUCKET_NAME" , "caltrans-pems-prd-us-west-2-marts" )
21
+
22
+ @property
23
+ def name (self ) -> str :
24
+ """
25
+ Returns:
26
+ value (str): The name of the S3 bucket this data source reads from.
27
+ """
28
+ return self ._name
12
29
13
30
def __init__ (self , name : str = None ):
14
- self .name = name or self .default_bucket
31
+ """Initialize a new S3DataSource.
32
+
33
+ Args:
34
+ name (str): (Optional) The name of the S3 bucket to source from; defaults to `default_bucket` when not provided.
35
+ """
15
36
self ._client = boto3 .client ("s3" )
37
+ self ._name = name or self .default_bucket
16
38
17
- def get_prefixes (self , filter_pattern : re .Pattern = re .compile (".+" ), initial_prefix : str = "" , match_func = None ) -> list :
39
+ def get_prefixes (
40
+ self ,
41
+ filter_pattern : re .Pattern = re .compile (".+" ),
42
+ initial_prefix : str = "" ,
43
+ match_func : Callable [[re .Match ], str ] = None ,
44
+ ) -> list :
18
45
"""
19
- Lists available filter options by inspecting S3 prefixes. Optionally filter by an initial prefix.
46
+ Lists available object prefixes, optionally filtered by an initial prefix.
20
47
21
48
When a match is found, if match_func exists, add its result to the output list. Otherwise add the entire match.
49
+
50
+ Args:
51
+ filter_pattern (re.Pattern): A regular expression used to match object prefixes
52
+ initial_prefix (str): The initial prefix to start the search from
53
+ match_func (Callable[[re.Match], str]): A callable used to extract data from prefix matches
54
+
55
+ Returns:
56
+ value (list): A sorted list of unique prefixes that matched the pattern.
22
57
"""
23
58
24
59
s3_keys = self ._client .list_objects (Bucket = self .name , Prefix = initial_prefix )
@@ -36,20 +71,33 @@ def get_prefixes(self, filter_pattern: re.Pattern = re.compile(".+"), initial_pr
36
71
37
72
return sorted (result )
38
73
39
- def read (self , * args : str , path = None , columns = None , filters = None , ** kwargs ) -> pd .DataFrame :
40
- """Reads data from the S3 path into a pandas DataFrame. Extra kwargs are pass along to `pandas.read_parquet()`.
74
+ def read (
75
+ self , * args : str , path : str = None , columns : list = None , filters : list = None , ** kwargs : dict [str , Any ]
76
+ ) -> pd .DataFrame :
77
+ """Reads data from the S3 path into a pandas DataFrame. Extra kwargs are passed along to `pandas.read_parquet()`.
41
78
42
79
Args:
43
- *args (str): One or more path relative path components for the data file.
44
- path (str): The absolute S3 URL path to a data file. Using `path` overrides any relative path components provided.
45
- columns (list[str]): If not None, only these columns will be read from the file.
46
- filters (list[tuple] | list[list[tuple]]): To filter out data. Filter syntax: `[[(column, op, val), ...],...]`.
80
+ *args (tuple[str]): One or more relative path components for the data file
81
+ path (str): The absolute S3 URL path to a data file; using `path` overrides any relative path components provided
82
+ columns (list[str]): If not None, only these columns will be read from the file
83
+ filters (list[tuple] | list[list[tuple]]): To filter out data. Filter syntax: `[[(column, op, val), ...],...]`
84
+ **kwargs (dict[str, Any]): Extra kwargs to pass to `pandas.read_parquet()`
85
+
86
+ Returns:
87
+ value (pandas.DataFrame): A DataFrame of data read from the source path.
47
88
"""
48
89
path = path or self .url (* args )
49
90
return pd .read_parquet (path , columns = columns , filters = filters , ** kwargs )
50
91
51
- def url (self , * args ):
52
- """Build an absolute S3 URL to this bucket, with optional path segments."""
92
+ def url (self , * args : str ) -> str :
93
+ """Build an absolute S3 URL to this bucket, with optional path segments.
94
+
95
+ Args:
96
+ *args (tuple[str]): The components of the S3 path.
97
+
98
+ Returns:
99
+ value (str): An absolute `s3://` URL for this bucket and the path.
100
+ """
53
101
parts = [f"s3://{ self .name } " ]
54
102
parts .extend (args )
55
103
return "/" .join (parts )
0 commit comments