@@ -73,4 +73,122 @@ def export_to_df(self, token: str) -> pd.DataFrame:
73
73
raise ValueError ("No .txt file found in zip archive" )
74
74
75
75
with zip_ref .open (txt_files [0 ]) as file :
76
- return pd .read_csv (file , sep = "\t " )
76
+ # First read to get column names
77
+ header_df = pd .read_csv (file , sep = "\t " , nrows = 0 )
78
+ columns = header_df .columns
79
+
80
+ # Reset file pointer to beginning
81
+ file .seek (0 )
82
+
83
+ # Identify datetime columns
84
+ date_columns = [
85
+ col
86
+ for col in columns
87
+ if any (time_word in col .lower () for time_word in ["date" , "time" , "created" , "modified" , "updated" ])
88
+ ]
89
+
90
+ # Identify boolean columns (common boolean column names)
91
+ bool_columns = [
92
+ col
93
+ for col in columns
94
+ if any (
95
+ bool_word in col .lower ()
96
+ for bool_word in ["is" , "has" , "can" , "allow" , "enabled" , "active" , "visible" ]
97
+ )
98
+ ]
99
+
100
+ # First read everything as string to identify columns with boolean values
101
+ temp_df = pd .read_csv (file , sep = "\t " , dtype = str , nrows = 1000 )
102
+ file .seek (0 ) # Reset file pointer again
103
+
104
+ # Find additional boolean columns by checking content
105
+ for col in columns :
106
+ if col not in bool_columns :
107
+ unique_vals = set (temp_df [col ].dropna ().unique ())
108
+ if unique_vals .issubset ({"True" , "False" , "" }) and len (unique_vals ) > 0 :
109
+ bool_columns .append (col )
110
+
111
+ # Identify columns that must always be strings
112
+ string_columns = [
113
+ col
114
+ for col in columns
115
+ if any (id_word in col .lower () for id_word in ["sku" , "id" , "code" , "number" , "upc" , "ean" , "isbn" ])
116
+ ]
117
+
118
+ # Read CSV with specific data types
119
+ df = pd .read_csv (
120
+ file ,
121
+ sep = "\t " ,
122
+ dtype = {
123
+ ** {col : str for col in columns }, # default all to string
124
+ ** {col : str for col in string_columns }, # ensure ID-like columns are strings
125
+ ** {
126
+ col : "Int64" # Use nullable integer type
127
+ for col in columns
128
+ if ("Quantity" in col or "Qty" in col )
129
+ and "Suggestion" not in col
130
+ and col not in date_columns
131
+ and col not in bool_columns
132
+ and col not in string_columns # exclude string columns from integer conversion
133
+ },
134
+ ** {
135
+ col : float
136
+ for col in columns
137
+ if ("Cost" in col or "Price" in col )
138
+ and "Competitor" not in col
139
+ and "Dealer" not in col
140
+ and "SalePrice" not in col
141
+ and col not in date_columns
142
+ and col not in bool_columns
143
+ and col not in string_columns # exclude string columns from float conversion
144
+ },
145
+ },
146
+ parse_dates = date_columns , # Convert date columns to datetime
147
+ na_values = ["" , "nan" , "NaN" , "NULL" ],
148
+ keep_default_na = False ,
149
+ )
150
+
151
+ # Fill NA values based on column type
152
+ for col in df .columns :
153
+ if df [col ].dtype == "Int64" :
154
+ df [col ] = df [col ].fillna (0 )
155
+ elif df [col ].dtype == "float64" :
156
+ df [col ] = df [col ].fillna (0.0 )
157
+ elif col not in date_columns : # Don't fill NA in date columns
158
+ df [col ] = df [col ].fillna ("" )
159
+
160
+ # Convert boolean columns after handling NA values
161
+ for col in bool_columns :
162
+ df [col ] = df [col ].map ({"True" : True , "False" : False , "" : False })
163
+
164
+ return df
165
+
166
+
167
def transform_attributes(ca_catalog: pd.DataFrame) -> pd.DataFrame:
    """Pivot ``Attribute{i} Name`` / ``Attribute{i} Value`` column pairs into
    one ``attr:<name>`` column per distinct attribute name.

    Parameters
    ----------
    ca_catalog : pd.DataFrame
        Catalog export containing zero or more ``Attribute{i} Name`` /
        ``Attribute{i} Value`` column pairs (i = 1..141). Pairs that are
        absent from the frame are skipped.

    Returns
    -------
    pd.DataFrame
        All original columns except the ``Attribute*`` ones, plus one string
        column per attribute name, prefixed with ``attr:``. Rows without a
        value for an attribute get ``""``.
    """
    # Candidate (name-column, value-column) pairs; existence is checked below
    # so a catalog with fewer slots no longer raises KeyError.
    attribute_pairs = [
        (f"Attribute{i} Name", f"Attribute{i} Value") for i in range(1, 142)
    ]

    transformed_data: dict[str, pd.Series] = {}

    for name_col, value_col in attribute_pairs:
        if name_col not in ca_catalog.columns or value_col not in ca_catalog.columns:
            continue  # this attribute slot doesn't exist in the export

        # dropna() already removes NaN, so every attr_name here is usable.
        for attr_name in ca_catalog[name_col].dropna().unique():
            new_col_name = f"attr:{attr_name}"
            mask = ca_catalog[name_col] == attr_name
            vals = ca_catalog[value_col].where(mask, "").fillna("").astype(str)
            if new_col_name in transformed_data:
                # The same attribute can live in different Attribute{i} slots
                # for different rows; merge into the existing column instead
                # of overwriting it (which would blank earlier rows).
                prev = transformed_data[new_col_name]
                transformed_data[new_col_name] = prev.where(prev != "", vals)
            else:
                transformed_data[new_col_name] = vals

    # Carry the source index so concat(axis=1) aligns row-for-row even when
    # the catalog has a non-default index.
    attr_df = pd.DataFrame(transformed_data, index=ca_catalog.index)

    # Keep everything except the original Attribute* columns.
    original_cols = [col for col in ca_catalog.columns if not col.startswith("Attribute")]
    return pd.concat([ca_catalog[original_cols], attr_df], axis=1)
0 commit comments