Skip to content

Commit 83bbfb6

Browse files
committed
feat(export): fix dtypes in export
1 parent be26b3a commit 83bbfb6

File tree

4 files changed

+2113
-62
lines changed

4 files changed

+2113
-62
lines changed

channel_advisor_api/models/channel_advisor_export.py

Lines changed: 119 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,4 +73,122 @@ def export_to_df(self, token: str) -> pd.DataFrame:
7373
raise ValueError("No .txt file found in zip archive")
7474

7575
with zip_ref.open(txt_files[0]) as file:
76-
return pd.read_csv(file, sep="\t")
76+
# First read to get column names
77+
header_df = pd.read_csv(file, sep="\t", nrows=0)
78+
columns = header_df.columns
79+
80+
# Reset file pointer to beginning
81+
file.seek(0)
82+
83+
# Identify datetime columns
84+
date_columns = [
85+
col
86+
for col in columns
87+
if any(time_word in col.lower() for time_word in ["date", "time", "created", "modified", "updated"])
88+
]
89+
90+
# Identify boolean columns (common boolean column names)
91+
bool_columns = [
92+
col
93+
for col in columns
94+
if any(
95+
bool_word in col.lower()
96+
for bool_word in ["is", "has", "can", "allow", "enabled", "active", "visible"]
97+
)
98+
]
99+
100+
# First read everything as string to identify columns with boolean values
101+
temp_df = pd.read_csv(file, sep="\t", dtype=str, nrows=1000)
102+
file.seek(0) # Reset file pointer again
103+
104+
# Find additional boolean columns by checking content
105+
for col in columns:
106+
if col not in bool_columns:
107+
unique_vals = set(temp_df[col].dropna().unique())
108+
if unique_vals.issubset({"True", "False", ""}) and len(unique_vals) > 0:
109+
bool_columns.append(col)
110+
111+
# Identify columns that must always be strings
112+
string_columns = [
113+
col
114+
for col in columns
115+
if any(id_word in col.lower() for id_word in ["sku", "id", "code", "number", "upc", "ean", "isbn"])
116+
]
117+
118+
# Read CSV with specific data types
119+
df = pd.read_csv(
120+
file,
121+
sep="\t",
122+
dtype={
123+
**{col: str for col in columns}, # default all to string
124+
**{col: str for col in string_columns}, # ensure ID-like columns are strings
125+
**{
126+
col: "Int64" # Use nullable integer type
127+
for col in columns
128+
if ("Quantity" in col or "Qty" in col)
129+
and "Suggestion" not in col
130+
and col not in date_columns
131+
and col not in bool_columns
132+
and col not in string_columns # exclude string columns from integer conversion
133+
},
134+
**{
135+
col: float
136+
for col in columns
137+
if ("Cost" in col or "Price" in col)
138+
and "Competitor" not in col
139+
and "Dealer" not in col
140+
and "SalePrice" not in col
141+
and col not in date_columns
142+
and col not in bool_columns
143+
and col not in string_columns # exclude string columns from float conversion
144+
},
145+
},
146+
parse_dates=date_columns, # Convert date columns to datetime
147+
na_values=["", "nan", "NaN", "NULL"],
148+
keep_default_na=False,
149+
)
150+
151+
# Fill NA values based on column type
152+
for col in df.columns:
153+
if df[col].dtype == "Int64":
154+
df[col] = df[col].fillna(0)
155+
elif df[col].dtype == "float64":
156+
df[col] = df[col].fillna(0.0)
157+
elif col not in date_columns: # Don't fill NA in date columns
158+
df[col] = df[col].fillna("")
159+
160+
# Convert boolean columns after handling NA values
161+
for col in bool_columns:
162+
df[col] = df[col].map({"True": True, "False": False, "": False})
163+
164+
return df
165+
166+
167+
def transform_attributes(ca_catalog: pd.DataFrame, num_attributes: int = 141) -> pd.DataFrame:
    """Pivot ChannelAdvisor ``Attribute{i}Name``/``Attribute{i}Value`` column
    pairs into one ``attr:<name>`` column per distinct attribute name.

    Args:
        ca_catalog: Export dataframe containing paired attribute columns.
        num_attributes: Number of attribute pairs to scan (default 141,
            matching the original hard-coded ChannelAdvisor export layout).

    Returns:
        A new dataframe with all ``Attribute*`` columns replaced by
        ``attr:<name>`` columns; missing values become empty strings.
    """
    transformed: dict[str, pd.Series] = {}

    for i in range(1, num_attributes + 1):
        name_col = f"Attribute{i}Name"
        value_col = f"Attribute{i}Value"
        # Guard: some exports carry fewer pairs than the maximum — skip
        # absent pairs instead of raising KeyError.
        if name_col not in ca_catalog.columns or value_col not in ca_catalog.columns:
            continue

        # dropna() already removed nulls, so no extra notna() check is needed.
        for attr_name in ca_catalog[name_col].dropna().unique():
            new_col_name = f"attr:{attr_name}"
            mask = ca_catalog[name_col] == attr_name
            values = ca_catalog[value_col].where(mask, "").fillna("").astype(str)
            if new_col_name in transformed:
                # The same attribute name can live in different pair slots on
                # different rows; merge with previously collected values
                # instead of overwriting (overwriting blanked earlier rows).
                prev = transformed[new_col_name]
                values = prev.where(prev != "", values)
            transformed[new_col_name] = values

    # Preserve the source index so concat aligns row-for-row.
    attr_df = pd.DataFrame(transformed, index=ca_catalog.index)

    # Keep everything except the original Attribute* columns.
    original_cols = [col for col in ca_catalog.columns if not col.startswith("Attribute")]
    return pd.concat([ca_catalog[original_cols], attr_df], axis=1)

notebooks/export.ipynb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"cells":[{"cell_type":"code","execution_count":1,"metadata":{},"outputs":[],"source":["import time\n","from channel_advisor_api.models.channel_advisor_export import ChannelAdvisorExport, transform_attributes\n","from dotenv import load_dotenv\n","\n","load_dotenv()\n","export = ChannelAdvisorExport()\n"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["\n","request = export.request_export()\n","print(request)\n","while not export.export_is_complete(request.token):\n"," print(\"Waiting for export to complete\")\n"," time.sleep(10)\n"," \n"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["token = \"B729CF-473BFF\"\n","df = export.export_to_df(token)\n","df.columns.to_list()\n"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["\n","import pandas as pd\n","ca_catalog = transform_attributes(df)\n","print(ca_catalog.Sku.dtype)\n","ca_catalog[~ca_catalog.Sku.str.contains(\".\", na=False)]\n","# df.to_csv(\"ca_catalog.csv\")\n","# df = pd.read_csv(\"ca_catalog.csv\")\n","# df.Sku.dtype\n"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["df[~df.Sku.str.contains(\".\", na=False)]\n"]}],"metadata":{"kernelspec":{"display_name":".venv","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.1"}},"nbformat":4,"nbformat_minor":2}

0 commit comments

Comments
 (0)