Skip to content

Commit b5af897

Browse files
committed
add clean_language function
1 parent c735cd9 commit b5af897

File tree

2 files changed

+8200
-0
lines changed

2 files changed

+8200
-0
lines changed

dataprep/clean/clean_language.py

Lines changed: 306 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,306 @@
1+
"""
2+
Clean and validate a DataFrame column containing language.
3+
"""
4+
5+
# pylint: disable=too-many-arguments, global-statement
6+
7+
from os import path
8+
from typing import Any, Union, Tuple, Optional
9+
10+
import dask
11+
import dask.dataframe as dd
12+
import numpy as np
13+
import pandas as pd
14+
15+
from ..progress_bar import ProgressBar
16+
from .utils import NULL_VALUES, to_dask
17+
18+
DEFAULT_LANGUAGE_DATA_FILE = path.join(path.split(path.abspath(__file__))[0], "language_data.csv")
19+
20+
DATA = pd.read_csv(DEFAULT_LANGUAGE_DATA_FILE, dtype=str)
21+
22+
23+
def clean_language(
    df: Union[pd.DataFrame, dd.DataFrame],
    col: str,
    input_format: Union[str, Tuple[str, ...]] = "auto",
    output_format: str = "name",
    kb_path: str = "default",
    inplace: bool = False,
    errors: str = "coerce",
    progress: bool = True,
) -> pd.DataFrame:
    """
    Clean language type data in a DataFrame column.

    Parameters
    ----------
    df
        A pandas or Dask DataFrame containing the data to be cleaned.
    col
        The name of the column containing data of language type.
    input_format
        The ISO 639 input format of the language.
            - 'auto': infer the input format
            - 'name': language name ('English')
            - 'alpha-2': alpha-2 code ('en')
            - 'alpha-3': alpha-3 code ('eng')

        Can also be a tuple containing any combination of input formats,
        for example to clean a column containing name and alpha-2
        codes set input_format to ('name', 'alpha-2').

        (default: 'auto')
    output_format
        The desired ISO 639 format of the language.
            - 'name': language name ('English')
            - 'alpha-2': alpha-2 code ('en')
            - 'alpha-3': alpha-3 code ('eng')

        (default: 'name')
    kb_path
        The path of a user specified knowledge base.
        In the current stage, it should be a file in the user's local
        directory following the format we propose.

        (default: 'default')
    inplace
        If True, delete the column containing the data that was cleaned.
        Otherwise, keep the original column.

        (default: False)
    errors
        How to handle parsing errors.
            - 'coerce': invalid parsing will be set to NaN.
            - 'ignore': invalid parsing will return the input.
            - 'raise': invalid parsing will raise an exception.

        (default: 'coerce')
    progress
        If True, display a progress bar.

        (default: True)

    Raises
    ------
    ValueError
        If `output_format` or `errors` is not one of its allowed values.

    Examples
    --------
    Clean a column of language data.

    >>> df = pd.DataFrame({'language': ['eng', 'zh', 'Japanese']})
    >>> clean_language(df, 'language')
       language language_clean
    0       eng        English
    1        zh        Chinese
    2  Japanese       Japanese
    """
    # load knowledge base (rebinds the module-level DATA table)
    _load_kb(kb_path)

    valid_output_formats = {"name", "alpha-2", "alpha-3"}
    if output_format not in valid_output_formats:
        raise ValueError(
            f'output_format {output_format} is invalid, it needs to be "name", '
            '"alpha-2" or "alpha-3"'
        )
    # validate `errors` up front, mirroring the output_format check; previously an
    # unrecognized value was silently treated as "coerce" by _format_language
    valid_errors = {"coerce", "ignore", "raise"}
    if errors not in valid_errors:
        raise ValueError(
            f'errors {errors} is invalid, it needs to be "coerce", "ignore" or "raise"'
        )
    input_formats = _convert_format_to_tuple(input_format)

    # convert to dask
    if isinstance(df, pd.DataFrame):
        df = to_dask(df)

    df[f"{col}_clean"] = df[col].map_partitions(
        lambda srs: [_format_language(x, input_formats, output_format, errors) for x in srs],
        meta=object,
    )

    with ProgressBar(minimum=0, disable=not progress):
        df = dask.compute(df)[0]

    if inplace:
        df = df.drop(columns=col)

    return df
122+
123+
124+
def validate_language(
    x: Union[str, pd.Series, dd.Series, pd.DataFrame, dd.DataFrame],
    col: Optional[str] = None,
    input_format: Union[str, Tuple[str, ...]] = "auto",
    kb_path: str = "default",
) -> Union[bool, pd.Series, pd.DataFrame]:
    """
    Validate language type data in a DataFrame column. For each cell, return True or False.

    Parameters
    ----------
    x
        Language data to be validated. It could be a single string, or
        a pandas or Dask DataFrame (with the parameter `col` to specify
        the column containing language data), or a pandas or Dask Series.
    col
        The name of the column to be validated.
        If x is not a pandas or Dask DataFrame, it would be ignored.
        If x is a pandas or Dask DataFrame but `col` is not specified,
        then the whole dataframe will be validated.

        (default: None)
    input_format
        The ISO 639 input format of the language.
            - 'auto': infer the input format
            - 'name': language name ('English')
            - 'alpha-2': alpha-2 code ('en')
            - 'alpha-3': alpha-3 code ('eng')

        Can also be a tuple containing any combination of input formats,
        for example to clean a column containing name and alpha-2
        codes set input_format to ('name', 'alpha-2').

        (default: 'auto')
    kb_path
        The path of a user specified knowledge base.
        In the current stage, it should be a file in the user's local
        directory following the format we propose.

        (default: "default")

    Raises
    ------
    TypeError
        If x is not a str, Series or DataFrame.
    """
    # refresh the module-level knowledge base before validating
    _load_kb(kb_path)

    fmts = _convert_format_to_tuple(input_format)

    def check_dask_series(srs: dd.Series, name: Any) -> pd.Series:
        # validate each partition, then materialize into a pandas Series
        part = srs.map_partitions(
            lambda s: [_check_language(v, fmts, False) for v in s], meta=bool
        )
        return pd.Series(dask.compute(part)[0][0], name=name)  # extract twice to get a list

    if isinstance(x, str):
        return _check_language(x, fmts, False)

    if isinstance(x, pd.Series):
        return x.apply(_check_language, args=(fmts, False))

    if isinstance(x, dd.Series):
        return check_dask_series(x, x.name)

    if isinstance(x, pd.DataFrame):
        x = to_dask(x)

    if not isinstance(x, dd.DataFrame):
        raise TypeError("must be str, Series or DataFrame")

    if col is not None:
        return check_dask_series(x[col], col)

    # validate the whole dataframe and return pd.Dataframe.
    res_df = pd.DataFrame()
    for col_name in x.columns:
        res_df[col_name] = check_dask_series(x[col_name], col_name)
    return res_df
203+
204+
205+
def _format_language(
    val: str, input_formats: Tuple[str, ...], output_format: str, errors: str
) -> Any:
    """
    Reformat a language string with proper output format.

    Returns np.nan for null values; unparseable values are handled
    according to `errors` ("raise" raises, "ignore" returns the input,
    anything else coerces to NaN).
    """
    result_index, status = _check_language(val, input_formats, True)

    if status == "null":
        return np.nan
    if status == "unknown":
        if errors == "raise":
            raise ValueError(f"unable to parse value {val}")
        return val if errors == "ignore" else np.nan

    result = DATA.loc[result_index, output_format]
    if pd.isna(result):
        # the language doesn't have the required output format
        # (comment previously said "country" — a copy-paste slip from the country cleaner)
        if errors == "raise":
            raise ValueError(f"unable to parse value {val}")
        return val if errors == "ignore" else np.nan

    # title-case language names for presentation; codes are returned as stored
    return result.title() if output_format == "name" else result
228+
229+
230+
def _check_language(val: str, input_formats: Tuple[str, ...], clean: bool) -> Any:
    """
    Find the index of the given language string in the DATA dataframe.

    Parameters
    ----------
    val
        String containing the language value to be cleaned.
    input_formats
        Tuple containing potential ISO 639 input formats of the language.
    clean
        If True, a tuple (index, status) is returned. There are 3 status:
            - "null": val is a null value.
            - "unknown": val could not be parsed.
            - "success": a successful parse of the value.
        If False, the function returns True/False to be used by the validate function.
    """
    if val in NULL_VALUES:
        return (None, "null") if clean else False

    normalized = str(val).lower().strip()

    # try each candidate format until the value matches a row of the knowledge base
    for fmt in input_formats:
        matches = DATA.loc[DATA[fmt].str.lower() == normalized].index
        if len(matches) > 0:
            return (matches[0], "success") if clean else True

    return (None, "unknown") if clean else False
261+
262+
263+
def _load_kb(
264+
kb_path: str,
265+
# encode: Optional[str] = None
266+
) -> Any:
267+
"""
268+
Load knowledge base from a specified path.
269+
"""
270+
global DATA
271+
if kb_path == "default":
272+
DATA = pd.read_csv(DEFAULT_LANGUAGE_DATA_FILE, dtype=str)
273+
else:
274+
DATA = pd.read_csv(kb_path, dtype=str)
275+
# check whether the format of the knowledge base is valid
276+
valid_formats = {"name", "alpha-2", "alpha-3"}
277+
for fmt in valid_formats:
278+
if fmt not in DATA.columns:
279+
raise KeyError(
280+
"knowledge base does not follow the format, "
281+
'it needs to contain "name", "alpha-2", and "alpha-3"'
282+
)
283+
284+
285+
def _convert_format_to_tuple(input_format: Union[str, Tuple[str, ...]]) -> Tuple[str, ...]:
286+
"""
287+
Converts a string input format to a tuple of allowed input formats and
288+
raises an error if an input format is not valid.
289+
"""
290+
if isinstance(input_format, str):
291+
if input_format == "auto":
292+
return ("name", "alpha-2", "alpha-3")
293+
else:
294+
input_format = (input_format,)
295+
296+
valid_input_formats = {"auto", "name", "alpha-2", "alpha-3"}
297+
for fmt in input_format:
298+
if fmt not in valid_input_formats:
299+
raise ValueError(
300+
f'input_format {fmt} is invalid, it needs to be one of "auto", '
301+
'"name", "alpha-2" or "alpha-3"'
302+
)
303+
if "auto" in input_format:
304+
return ("name", "alpha-2", "alpha-3")
305+
306+
return input_format

0 commit comments

Comments
 (0)