|
| 1 | +""" |
| 2 | +Clean and validate a DataFrame column containing language. |
| 3 | +""" |
| 4 | + |
| 5 | +# pylint: disable=too-many-arguments, global-statement |
| 6 | + |
| 7 | +from os import path |
| 8 | +from typing import Any, Union, Tuple, Optional |
| 9 | + |
| 10 | +import dask |
| 11 | +import dask.dataframe as dd |
| 12 | +import numpy as np |
| 13 | +import pandas as pd |
| 14 | + |
| 15 | +from ..progress_bar import ProgressBar |
| 16 | +from .utils import NULL_VALUES, to_dask |
| 17 | + |
| 18 | +DEFAULT_LANGUAGE_DATA_FILE = path.join(path.split(path.abspath(__file__))[0], "language_data.csv") |
| 19 | + |
| 20 | +DATA = pd.read_csv(DEFAULT_LANGUAGE_DATA_FILE, dtype=str) |
| 21 | + |
| 22 | + |
def clean_language(
    df: Union[pd.DataFrame, dd.DataFrame],
    col: str,
    input_format: Union[str, Tuple[str, ...]] = "auto",
    output_format: str = "name",
    kb_path: str = "default",
    inplace: bool = False,
    errors: str = "coerce",
    progress: bool = True,
) -> pd.DataFrame:
    """
    Clean language type data in a DataFrame column.

    Parameters
    ----------
    df
        A pandas or Dask DataFrame containing the data to be cleaned.
    col
        The name of the column containing data of language type.
    input_format
        The ISO 639 input format of the language.
        - 'auto': infer the input format
        - 'name': language name ('English')
        - 'alpha-2': alpha-2 code ('en')
        - 'alpha-3': alpha-3 code ('eng')

        Can also be a tuple containing any combination of input formats,
        for example to clean a column containing name and alpha-2
        codes set input_format to ('name', 'alpha-2').

        (default: 'auto')
    output_format
        The desired ISO 639 format of the language.
        - 'name': language name ('English')
        - 'alpha-2': alpha-2 code ('en')
        - 'alpha-3': alpha-3 code ('eng')

        (default: 'name')
    kb_path
        The path of user specified knowledge base.
        In current stage, it should be in the user's local directory
        following by the format we proposing.

        (default: 'default')
    inplace
        If True, delete the column containing the data that was cleaned.
        Otherwise, keep the original column.

        (default: False)
    errors
        How to handle parsing errors.
        - 'coerce': invalid parsing will be set to NaN.
        - 'ignore': invalid parsing will return the input.
        - 'raise': invalid parsing will raise an exception.

        (default: 'coerce')
    progress
        If True, display a progress bar.

        (default: True)

    Raises
    ------
    ValueError
        If `output_format`, `input_format` or `errors` is not one of its
        documented values.

    Examples
    --------
    Clean a column of language data.

    >>> df = pd.DataFrame({'language': ['eng', 'zh', 'Japanese']})
    >>> clean_language(df, 'language')
       language language_clean
    0       eng        English
    1        zh        Chinese
    2  Japanese       Japanese
    """
    # load knowledge base (reloads the module-level DATA table)
    _load_kb(kb_path)

    valid_output_formats = {"name", "alpha-2", "alpha-3"}
    if output_format not in valid_output_formats:
        raise ValueError(
            f'output_format {output_format} is invalid, it needs to be "name", '
            '"alpha-2" or "alpha-3"'
        )

    # BUGFIX: previously an unrecognized `errors` value was silently treated
    # like "coerce" inside _format_language; validate it up front so typos
    # fail loudly, mirroring the output_format check above.
    valid_errors = {"coerce", "ignore", "raise"}
    if errors not in valid_errors:
        raise ValueError(
            f'errors {errors} is invalid, it needs to be "coerce", "ignore" or "raise"'
        )

    input_formats = _convert_format_to_tuple(input_format)

    # convert to dask
    if isinstance(df, pd.DataFrame):
        df = to_dask(df)

    df[f"{col}_clean"] = df[col].map_partitions(
        lambda srs: [_format_language(x, input_formats, output_format, errors) for x in srs],
        meta=object,
    )

    with ProgressBar(minimum=0, disable=not progress):
        df = dask.compute(df)[0]

    if inplace:
        df = df.drop(columns=col)

    return df
| 122 | + |
| 123 | + |
def validate_language(
    x: Union[str, pd.Series, dd.Series, pd.DataFrame, dd.DataFrame],
    col: Optional[str] = None,
    input_format: Union[str, Tuple[str, ...]] = "auto",
    kb_path: str = "default",
) -> Union[bool, pd.Series, pd.DataFrame]:
    """
    Validate language type data in a DataFrame column. For each cell, return True or False.

    Parameters
    ----------
    x
        Language data to be validated. It could be a single string, or
        a pandas or Dask DataFrame (with the parameter `col` to specify
        the column containing language data), or a pandas or Dask Series.
    col
        The name of the column to be validated.
        If x is not a pandas or Dask DataFrame, it would be ignored.
        If x is a pandas or Dask DataFrame but `col` is not specified,
        then the whole dataframe will be validated.

        (default: None)
    input_format
        The ISO 639 input format of the language.
        - 'auto': infer the input format
        - 'name': language name ('English')
        - 'alpha-2': alpha-2 code ('en')
        - 'alpha-3': alpha-3 code ('eng')

        Can also be a tuple containing any combination of input formats,
        for example to clean a column containing name and alpha-2
        codes set input_format to ('name', 'alpha-2').

        (default: 'auto')
    kb_path
        The path of user specified knowledge base.
        In current stage, it should be in the user's local directory
        following by the format we proposing.

        (default: "default")
    """
    # load knowledge base (reloads the module-level DATA table)
    _load_kb(kb_path)

    fmts = _convert_format_to_tuple(input_format)

    # scalar: a plain boolean
    if isinstance(x, str):
        return _check_language(x, fmts, False)

    # pandas Series: element-wise boolean Series
    if isinstance(x, pd.Series):
        return x.apply(_check_language, args=(fmts, False))

    # dask Series: compute eagerly and wrap the result back into pandas
    if isinstance(x, dd.Series):
        part = x.map_partitions(
            lambda srs: [_check_language(v, fmts, False) for v in srs], meta=bool
        )
        # dask.compute wraps results in a tuple; index twice to reach the list
        return pd.Series(dask.compute(part)[0][0], name=x.name)

    # pandas DataFrame is handled through the dask path below
    if isinstance(x, pd.DataFrame):
        x = to_dask(x)

    if not isinstance(x, dd.DataFrame):
        raise TypeError("must be str, Series or DataFrame")

    if col is not None:
        part = x[col].map_partitions(
            lambda srs: [_check_language(v, fmts, False) for v in srs], meta=bool
        )
        return pd.Series(dask.compute(part)[0][0], name=col)

    # no column specified: validate every column and return a pd.DataFrame
    res_df = pd.DataFrame()
    for col_name in x.columns:
        part = x[col_name].map_partitions(
            lambda srs: [_check_language(v, fmts, False) for v in srs],
            meta=bool,
        )
        res_df[col_name] = pd.Series(dask.compute(part)[0][0], name=col_name)
    return res_df
| 203 | + |
| 204 | + |
def _format_language(
    val: str, input_formats: Tuple[str, ...], output_format: str, errors: str
) -> Any:
    """
    Convert a single language value to the requested output format.

    Nulls become NaN; unparseable values are handled per `errors`
    ('raise' / 'ignore' / otherwise coerced to NaN).
    """
    index, status = _check_language(val, input_formats, True)

    if status == "null":
        return np.nan

    if status == "success":
        converted = DATA.loc[index, output_format]
        if not pd.isna(converted):
            # language names are title-cased; codes are returned as stored
            return converted.title() if output_format == "name" else converted
        # the matched language row has no entry for the requested format;
        # fall through to the error-handling policy below

    if errors == "raise":
        raise ValueError(f"unable to parse value {val}")
    return val if errors == "ignore" else np.nan
| 228 | + |
| 229 | + |
def _check_language(val: str, input_formats: Tuple[str, ...], clean: bool) -> Any:
    """
    Find the index of the given language string in the DATA dataframe.

    Parameters
    ----------
    val
        String containing the language value to be cleaned.
    input_formats
        Tuple containing potential ISO 639 input formats of the language.
    clean
        If True, a tuple (index, status) is returned. There are 3 status:
        - "null": val is a null value.
        - "unknown": val could not be parsed.
        - "success": a successful parse of the value.
        If False, the function returns True/False to be used by the validate function.
    """
    if val in NULL_VALUES:
        return (None, "null") if clean else False

    # matching is case-insensitive and ignores surrounding whitespace
    needle = str(val).lower().strip()

    for fmt in input_formats:
        matches = DATA.loc[DATA[fmt].str.lower() == needle].index
        if len(matches) > 0:
            # first matching row wins
            return (matches[0], "success") if clean else True

    return (None, "unknown") if clean else False
| 261 | + |
| 262 | + |
| 263 | +def _load_kb( |
| 264 | + kb_path: str, |
| 265 | + # encode: Optional[str] = None |
| 266 | +) -> Any: |
| 267 | + """ |
| 268 | + Load knowledge base from a specified path. |
| 269 | + """ |
| 270 | + global DATA |
| 271 | + if kb_path == "default": |
| 272 | + DATA = pd.read_csv(DEFAULT_LANGUAGE_DATA_FILE, dtype=str) |
| 273 | + else: |
| 274 | + DATA = pd.read_csv(kb_path, dtype=str) |
| 275 | + # check whether the format of the knowledge base is valid |
| 276 | + valid_formats = {"name", "alpha-2", "alpha-3"} |
| 277 | + for fmt in valid_formats: |
| 278 | + if fmt not in DATA.columns: |
| 279 | + raise KeyError( |
| 280 | + "knowledge base does not follow the format, " |
| 281 | + 'it needs to contain "name", "alpha-2", and "alpha-3"' |
| 282 | + ) |
| 283 | + |
| 284 | + |
| 285 | +def _convert_format_to_tuple(input_format: Union[str, Tuple[str, ...]]) -> Tuple[str, ...]: |
| 286 | + """ |
| 287 | + Converts a string input format to a tuple of allowed input formats and |
| 288 | + raises an error if an input format is not valid. |
| 289 | + """ |
| 290 | + if isinstance(input_format, str): |
| 291 | + if input_format == "auto": |
| 292 | + return ("name", "alpha-2", "alpha-3") |
| 293 | + else: |
| 294 | + input_format = (input_format,) |
| 295 | + |
| 296 | + valid_input_formats = {"auto", "name", "alpha-2", "alpha-3"} |
| 297 | + for fmt in input_format: |
| 298 | + if fmt not in valid_input_formats: |
| 299 | + raise ValueError( |
| 300 | + f'input_format {fmt} is invalid, it needs to be one of "auto", ' |
| 301 | + '"name", "alpha-2" or "alpha-3"' |
| 302 | + ) |
| 303 | + if "auto" in input_format: |
| 304 | + return ("name", "alpha-2", "alpha-3") |
| 305 | + |
| 306 | + return input_format |
0 commit comments