@@ -73,4 +73,122 @@ def export_to_df(self, token: str) -> pd.DataFrame:
73
73
raise ValueError ("No .txt file found in zip archive" )
74
74
75
75
with zip_ref .open (txt_files [0 ]) as file :
76
- return pd .read_csv (file , sep = "\t " )
76
+ # First read to get column names
77
+ header_df = pd .read_csv (file , sep = "\t " , nrows = 0 )
78
+ columns = header_df .columns
79
+
80
+ # Reset file pointer to beginning
81
+ file .seek (0 )
82
+
83
+ # Identify datetime columns
84
+ date_columns = [
85
+ col
86
+ for col in columns
87
+ if any (time_word in col .lower () for time_word in ["date" , "time" , "created" , "modified" , "updated" ])
88
+ ]
89
+
90
+ # Identify boolean columns (common boolean column names)
91
+ bool_columns = [
92
+ col
93
+ for col in columns
94
+ if any (
95
+ bool_word in col .lower ()
96
+ for bool_word in ["is" , "has" , "can" , "allow" , "enabled" , "active" , "visible" ]
97
+ )
98
+ ]
99
+
100
+ # First read everything as string to identify columns with boolean values
101
+ temp_df = pd .read_csv (file , sep = "\t " , dtype = str , nrows = 1000 )
102
+ file .seek (0 ) # Reset file pointer again
103
+
104
+ # Find additional boolean columns by checking content
105
+ for col in columns :
106
+ if col not in bool_columns :
107
+ unique_vals = set (temp_df [col ].dropna ().unique ())
108
+ if unique_vals .issubset ({"True" , "False" , "" }) and len (unique_vals ) > 0 :
109
+ bool_columns .append (col )
110
+
111
+ # Identify columns that must always be strings
112
+ string_columns = [
113
+ col
114
+ for col in columns
115
+ if any (id_word in col .lower () for id_word in ["sku" , "id" , "code" , "number" , "upc" , "ean" , "isbn" ])
116
+ ]
117
+
118
+ # Read CSV with specific data types
119
+ df = pd .read_csv (
120
+ file ,
121
+ sep = "\t " ,
122
+ dtype = {
123
+ ** {col : str for col in columns }, # default all to string
124
+ ** {col : str for col in string_columns }, # ensure ID-like columns are strings
125
+ ** {
126
+ col : "Int64" # Use nullable integer type
127
+ for col in columns
128
+ if ("Quantity" in col or "Qty" in col )
129
+ and "Suggestion" not in col
130
+ and col not in date_columns
131
+ and col not in bool_columns
132
+ and col not in string_columns # exclude string columns from integer conversion
133
+ },
134
+ ** {
135
+ col : float
136
+ for col in columns
137
+ if ("Cost" in col or "Price" in col )
138
+ and "Competitor" not in col
139
+ and "Dealer" not in col
140
+ and "SalePrice" not in col
141
+ and col not in date_columns
142
+ and col not in bool_columns
143
+ and col not in string_columns # exclude string columns from float conversion
144
+ },
145
+ },
146
+ parse_dates = date_columns , # Convert date columns to datetime
147
+ na_values = ["" , "nan" , "NaN" , "NULL" ],
148
+ keep_default_na = False ,
149
+ )
150
+
151
+ # Fill NA values based on column type
152
+ for col in df .columns :
153
+ if df [col ].dtype == "Int64" :
154
+ df [col ] = df [col ].fillna (0 )
155
+ elif df [col ].dtype == "float64" :
156
+ df [col ] = df [col ].fillna (0.0 )
157
+ elif col not in date_columns : # Don't fill NA in date columns
158
+ df [col ] = df [col ].fillna ("" )
159
+
160
+ # Convert boolean columns after handling NA values
161
+ for col in bool_columns :
162
+ df [col ] = df [col ].map ({"True" : True , "False" : False , "" : False })
163
+
164
+ return df
165
+
166
+
167
def transform_attributes(ca_catalog: pd.DataFrame) -> pd.DataFrame:
    """Pivot ``Attribute{i} Name`` / ``Attribute{i} Value`` column pairs into
    one ``attr:<name>`` column per distinct attribute name.

    Parameters
    ----------
    ca_catalog : pd.DataFrame
        Catalog export containing zero or more ``Attribute{i} Name`` /
        ``Attribute{i} Value`` column pairs (i = 1..141). Pairs that are
        absent from the frame are skipped.

    Returns
    -------
    pd.DataFrame
        All original columns except the ``Attribute*`` ones, plus one string
        column per attribute name, prefixed with ``attr:``. Rows without a
        value for an attribute get ``""``.
    """
    # Candidate (name-column, value-column) pairs; existence is checked below
    # so a catalog with fewer slots no longer raises KeyError.
    attribute_pairs = [
        (f"Attribute{i} Name", f"Attribute{i} Value") for i in range(1, 142)
    ]

    transformed_data: dict[str, pd.Series] = {}

    for name_col, value_col in attribute_pairs:
        if name_col not in ca_catalog.columns or value_col not in ca_catalog.columns:
            continue  # this attribute slot doesn't exist in the export

        # dropna() already removes NaN, so every attr_name here is usable.
        for attr_name in ca_catalog[name_col].dropna().unique():
            new_col_name = f"attr:{attr_name}"
            mask = ca_catalog[name_col] == attr_name
            vals = ca_catalog[value_col].where(mask, "").fillna("").astype(str)
            if new_col_name in transformed_data:
                # The same attribute can live in different Attribute{i} slots
                # for different rows; merge into the existing column instead
                # of overwriting it (which would blank earlier rows).
                prev = transformed_data[new_col_name]
                transformed_data[new_col_name] = prev.where(prev != "", vals)
            else:
                transformed_data[new_col_name] = vals

    # Carry the source index so concat(axis=1) aligns row-for-row even when
    # the catalog has a non-default index.
    attr_df = pd.DataFrame(transformed_data, index=ca_catalog.index)

    # Keep everything except the original Attribute* columns.
    original_cols = [col for col in ca_catalog.columns if not col.startswith("Attribute")]
    return pd.concat([ca_catalog[original_cols], attr_df], axis=1)
0 commit comments