Skip to content

Commit c056200

Browse files
authored
Merge pull request #41 from datamol-org/sanitize-sdf
better sanitize handling in read_sdf
2 parents e6225d6 + 46994af commit c056200

File tree

9 files changed

+249
-33
lines changed

9 files changed

+249
-33
lines changed

datamol/convert.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,7 @@ def from_df(
286286
smiles_column: Optional[str] = "smiles",
287287
mol_column: str = None,
288288
conserve_smiles: bool = False,
289+
sanitize: bool = True,
289290
) -> List[Chem.rdchem.Mol]:
290291
"""Convert a dataframe to a list of mols.
291292
@@ -300,6 +301,7 @@ def from_df(
300301
mol_column: Column name to extract the molecule. It takes
301302
precedence over `smiles_column`.
302303
conserve_smiles: Whether to conserve the SMILES in the mols' props.
304+
sanitize: Whether to sanitize if `smiles_column` is not None.
303305
"""
304306

305307
if smiles_column is None and mol_column is None:
@@ -308,12 +310,18 @@ def from_df(
308310
if len(df) == 0:
309311
return []
310312

313+
# Try to detect the mol column if `mol_column` is None.
314+
if mol_column is None:
315+
for col in df.columns:
316+
if isinstance(df[col].iloc[0], Chem.rdchem.Mol):
317+
mol_column = col
318+
311319
def _row_to_mol(row):
312320

313321
props = row.to_dict()
314322

315323
if mol_column is not None:
316-
mol = props[mol_column]
324+
mol = props.pop(mol_column)
317325
else:
318326

319327
if conserve_smiles:
@@ -323,7 +331,7 @@ def _row_to_mol(row):
323331
# properties.
324332
smiles = props.pop(smiles_column)
325333

326-
mol = dm.to_mol(smiles)
334+
mol = dm.to_mol(smiles, sanitize=sanitize)
327335

328336
if mol is None:
329337
return None

datamol/io.py

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def read_csv(
3939
df: a `pandas.DataFrame`
4040
"""
4141

42-
df = pd.read_csv(urlpath, **kwargs)
42+
df: pd.DataFrame = pd.read_csv(urlpath, **kwargs) # type: ignore
4343

4444
if smiles_column is not None:
4545
PandasTools.AddMoleculeColumnToFrame(df, smiles_column, mol_column)
@@ -78,48 +78,60 @@ def read_excel(
7878

7979
def read_sdf(
8080
urlpath: Union[str, os.PathLike, TextIO],
81+
sanitize: bool = True,
8182
as_df: bool = False,
8283
smiles_column: Optional[str] = "smiles",
8384
mol_column: str = None,
8485
include_private: bool = False,
8586
include_computed: bool = False,
86-
sanitize: bool = True,
8787
strict_parsing: bool = True,
8888
) -> Union[List[Chem.rdchem.Mol], pd.DataFrame]:
8989
"""Read an SDF file.
9090
91+
Note: This function is meant to be used with dataset that fit _in-memory_.
92+
For a more advanced usage we suggest you to use directly `Chem.ForwardSDMolSupplier`.
93+
9194
Args:
9295
urlpath: Path to a file or a file-like object. Path can be remote or local.
96+
sanitize: Whether to sanitize the molecules.
9397
as_df: Whether to return a list mol or a pandas DataFrame.
9498
smiles_column: Name of the SMILES column. Only relevant if `as_df` is True.
9599
mol_column: Name of the mol column. Only relevant if `as_df` is True.
96100
include_private: Include private properties in the columns. Only relevant if
97101
`as_df` is True.
98102
include_computed: Include computed properties in the columns. Only relevant if
99103
`as_df` is True.
100-
sanitize: Whether to sanitize the molecules
101104
strict_parsing: If set to false, the parser is more lax about correctness of the contents.
102105
"""
103106

104107
# File-like object
105108
if isinstance(urlpath, io.IOBase):
106-
supplier = Chem.ForwardSDMolSupplier(urlpath, sanitize=False, strictParsing=strict_parsing)
107-
mols = [mol for mol in supplier if mol is not None]
109+
supplier = Chem.ForwardSDMolSupplier(
110+
urlpath,
111+
sanitize=sanitize,
112+
strictParsing=strict_parsing,
113+
)
114+
mols = list(supplier)
108115

109116
# Regular local or remote paths
110117
else:
111118
with fsspec.open(urlpath) as f:
119+
120+
# Handle gzip file if needed
112121
if str(urlpath).endswith(".gz") or str(urlpath).endswith(".gzip"):
113122
f = gzip.open(f)
114-
supplier = Chem.ForwardSDMolSupplier(f, sanitize=False, strictParsing=strict_parsing)
115-
mols = [mol for mol in supplier if mol is not None]
116123

117-
if sanitize == True:
118-
mols_props = [
119-
(dm.sanitize_mol(mol), mol.GetPropsAsDict()) for mol in mols if mol is not None
120-
]
121-
mols = [dm.set_mol_props(mol, props) for mol, props in mols_props]
124+
supplier = Chem.ForwardSDMolSupplier(
125+
f,
126+
sanitize=sanitize,
127+
strictParsing=strict_parsing,
128+
)
129+
mols = list(supplier)
122130

131+
# Discard None values
132+
mols = [mol for mol in mols if mol is not None]
133+
134+
# Convert to dataframe
123135
if as_df:
124136
return dm.to_df(
125137
mols,
@@ -133,15 +145,15 @@ def read_sdf(
133145

134146

135147
def to_sdf(
136-
mols: Union[Sequence[Chem.rdchem.Mol], pd.DataFrame],
148+
mols: Union[Chem.rdchem.Mol, Sequence[Chem.rdchem.Mol], pd.DataFrame],
137149
urlpath: Union[str, os.PathLike, TextIO],
138150
smiles_column: Optional[str] = "smiles",
139151
mol_column: str = None,
140152
):
141153
"""Write molecules to a file.
142154
143155
Args:
144-
mols: a dataframe or a list of molecule.
156+
mols: a dataframe, a molecule or a list of molecule.
145157
urlpath: Path to a file or a file-like object. Path can be remote or local.
146158
smiles_column: Column name to extract the molecule.
147159
mol_column: Column name to extract the molecule. It takes
@@ -151,6 +163,9 @@ def to_sdf(
151163
if isinstance(mols, pd.DataFrame):
152164
mols = dm.from_df(mols, smiles_column=smiles_column, mol_column=mol_column)
153165

166+
elif isinstance(mols, Chem.rdchem.Mol):
167+
mols = [mols]
168+
154169
# Filter out None values
155170
mols = [mol for mol in mols if mol is not None]
156171

datamol/mol.py

Lines changed: 45 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
import copy
99
import random
1010

11+
from loguru import logger
12+
1113
from rdkit import Chem
1214
from rdkit.Chem import rdmolops
1315
from rdkit.Chem.MolStandardize import rdMolStandardize
@@ -72,7 +74,7 @@ def to_mol(
7274

7375
# Add hydrogens
7476
if _mol is not None and add_hs:
75-
_mol = Chem.AddHs(_mol, explicitOnly=explicit_only)
77+
_mol = Chem.AddHs(_mol, explicitOnly=explicit_only, addCoords=True)
7678

7779
# Reorder atoms
7880
if _mol is not None and ordered:
@@ -154,38 +156,67 @@ def to_neutral(mol: Chem.rdchem.Mol) -> Optional[Chem.rdchem.Mol]:
154156

155157

156158
def sanitize_mol(
157-
mol: Chem.rdchem.Mol, charge_neutral: bool = False, sanifix: bool = True
159+
mol: Chem.rdchem.Mol,
160+
charge_neutral: bool = False,
161+
sanifix: bool = True,
162+
verbose: bool = True,
163+
add_hs: bool = False,
158164
) -> Optional[Chem.rdchem.Mol]:
159-
"""Sanitize molecule and fix common errors.
165+
"""An augmented version of RDKit `sanitize=True`. It uses a
166+
mol-SMILES-mol conversion to catch potential aromaticity errors
167+
and try to fix aromatic nitrogen (using the popular sanifix4 script).
168+
Optionally, it can neutralize the charge of the molecule.
160169
161-
Warning:
162-
The procedure includes a SMILES conversion to avoid accasional aromaticity
163-
errors. In consequence, all the properties and the conformers will be lost.
170+
Note #1: Only the first conformer (if present) will be preserved and
171+
a warning will be displayed if more than one conformer is detected.
172+
173+
Note #2: The molecule's properties will be preserved but the atom's
174+
properties will be lost.
164175
165176
Args:
166177
mol: a molecule.
167178
charge_neutral: whether charge neutralization should be applied.
168179
sanifix: whether to run the sanifix from James Davidson
169180
(sanifix4.py) that try to adjust aromatic nitrogens.
181+
verbose: Whether displaying a warning about multiple conformers.
182+
add_hs: Add hydrogens to the returned molecule. Useful when the input
183+
molecule already contains hydrogens.
170184
171185
Returns:
172186
mol: a molecule.
173187
"""
174188
if mol is None:
175189
return mol
176190

191+
# Extract properties.
192+
original_mol = copy_mol(mol)
193+
properties = original_mol.GetPropsAsDict()
194+
177195
if charge_neutral:
178196
mol = to_neutral(mol)
179197

180198
if sanifix:
181199
mol = _sanifix4.sanifix(mol)
182200

183-
if mol:
201+
if mol is not None:
202+
203+
# Detect multiple conformers
204+
if verbose and mol.GetNumConformers() > 1:
205+
logger.warning(
206+
f"The molecule contains multiple conformers. Only the first one will be preserved."
207+
)
208+
209+
# Try catch to avoid occasional aromaticity errors
184210
try:
185-
# Try catch to avoid occasional aromaticity errors
186-
return to_mol(dm.to_smiles(mol), sanitize=True) # type: ignore
211+
# `cxsmiles` is used here to preserve the first conformer.
212+
mol = to_mol(dm.to_smiles(mol, cxsmiles=True), sanitize=True, add_hs=add_hs) # type: ignore
187213
except Exception:
188-
return None
214+
mol = None
215+
216+
if mol is not None:
217+
# Insert back properties.
218+
mol = dm.set_mol_props(mol, properties)
219+
189220
return mol
190221

191222

@@ -276,7 +307,7 @@ def standardize_mol(
276307
Returns:
277308
mol: The standardized molecule.
278309
"""
279-
mol = copy.copy(mol)
310+
mol = copy_mol(mol)
280311

281312
if disconnect_metals:
282313
md = rdMolStandardize.MetalDisconnector()
@@ -584,7 +615,9 @@ def set_dative_bonds(
584615

585616

586617
def set_mol_props(
587-
mol: Chem.rdchem.Mol, props: Dict[str, Any], copy: bool = False
618+
mol: Chem.rdchem.Mol,
619+
props: Dict[str, Any],
620+
copy: bool = False,
588621
) -> Chem.rdchem.Mol:
589622
"""Set properties to a mol from a dict.
590623

env.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,6 @@ dependencies:
4444
- markdown-include
4545
- mdx_truly_sane_lists
4646
- mike >=1.0.0
47-
- markdown-it-py ==0.6.2 # no direct deps
48-
- mkdocs-autorefs =0.1 # no direct deps
4947

5048
# Releasing tools
5149
- rever >=0.4.5

news/sanitize-sdf.rst

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
**Added:**
2+
3+
* Add a sanitize flag to `from_df`.
4+
* Automatically detect the mol column in `from_df`.
5+
* Add `add_hs` arg to `sanitize_mol`.
6+
7+
**Changed:**
8+
9+
* Allow input a single molecule to `dm.to_sdf` instead of a list of mol.
10+
* Preserve mol properties and the frist conformer in `dm.sanitize_mol`.
11+
* Display a warning message when input mol has multiple conformers in `dm.sanitize_mol`.
12+
13+
**Deprecated:**
14+
15+
* <news item>
16+
17+
**Removed:**
18+
19+
* <news item>
20+
21+
**Fixed:**
22+
23+
* Remove call to `sanitize_mol` in `read_sdf`, instead use `sanitize=True` from RDKit.
24+
* Remove the `mol` column from the mol properties in `from_df`. It also fixes `to_sdf`.
25+
26+
**Security:**
27+
28+
* <news item>

tests/conftest.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
import platform
22
import pathlib
3+
from loguru import logger
34

45
import pytest
6+
from _pytest.logging import caplog as _caplog
7+
58

69
DATA_DIR_PATH = pathlib.Path(__file__).parent.resolve() / "data"
710

@@ -36,3 +39,20 @@ def pytest_configure(config):
3639
@pytest.fixture
3740
def datadir(request):
3841
return DATA_DIR_PATH
42+
43+
44+
@pytest.fixture
45+
def caplog(_caplog):
46+
"""Monkeypatching the pytest caplog to work with loguru.
47+
48+
See https://loguru.readthedocs.io/en/latest/resources/migration.html#making-things-work-with-pytest-and-caplog
49+
"""
50+
import logging
51+
52+
class PropogateHandler(logging.Handler):
53+
def emit(self, record):
54+
logging.getLogger(record.name).handle(record)
55+
56+
handler_id = logger.add(PropogateHandler(), format="{message}")
57+
yield _caplog
58+
logger.remove(handler_id)

tests/test_convert.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -169,11 +169,10 @@ def test_to_df_smiles_warning(datadir, caplog):
169169

170170
assert sum(df.columns == "smiles") == 2
171171

172-
for record in caplog.records:
173-
assert record.levelname != "WARNING"
172+
assert "WARNING" in caplog.text
174173
assert (
175174
"The SMILES column name provided ('smiles') is already present in the properties of the molecules"
176-
not in caplog.text
175+
in caplog.text
177176
)
178177

179178

@@ -190,3 +189,19 @@ def test_to_smiles_fail():
190189
# NOTE(hadim): ideally you want to catch only `Boost.Python.ArgumentError` here.
191190
with pytest.raises(Exception):
192191
dm.to_smiles(55, allow_to_fail=True)
192+
193+
194+
def test_from_df_pop_mol_column():
195+
df = dm.data.freesolv().iloc[:10] # type: ignore
196+
mols = [dm.to_mol(smiles) for smiles in df["smiles"]]
197+
198+
df: pd.DataFrame = dm.to_df(mols, mol_column="mol") # type: ignore
199+
df["dummy"] = "hello"
200+
201+
# test with provided mol column
202+
mols = dm.from_df(df.copy(), mol_column="mol")
203+
assert set(mols[0].GetPropsAsDict().keys()) == {"smiles", "dummy"}
204+
205+
# test with automatic mol column detection
206+
mols = dm.from_df(df.copy())
207+
assert set(mols[0].GetPropsAsDict().keys()) == {"smiles", "dummy"}

0 commit comments

Comments
 (0)