Merge pull request #41 from datamol-org/sanitize-sdf

hadim · web-flow · commit c056200abaa7 · 2021-06-10T19:06:43.000-04:00
better sanitize handling in read_sdf
diff --git a/datamol/convert.py b/datamol/convert.py
@@ -286,6 +286,7 @@ def from_df(
     smiles_column: Optional[str] = "smiles",
     mol_column: str = None,
     conserve_smiles: bool = False,
+    sanitize: bool = True,
 ) -> List[Chem.rdchem.Mol]:
     """Convert a dataframe to a list of mols.
 
@@ -300,6 +301,7 @@ def from_df(
         mol_column: Column name to extract the molecule. It takes
             precedence over `smiles_column`.
         conserve_smiles: Whether to conserve the SMILES in the mols' props.
+        sanitize: Whether to sanitize the mols built from `smiles_column`. Ignored when a mol column is used.
     """
 
     if smiles_column is None and mol_column is None:
@@ -308,12 +310,18 @@ def from_df(
     if len(df) == 0:
         return []
 
+    # Try to detect the mol column if `mol_column` is None.
+    if mol_column is None:
+        for col in df.columns:
+            if isinstance(df[col].iloc[0], Chem.rdchem.Mol):
+                mol_column = col
+
     def _row_to_mol(row):
 
         props = row.to_dict()
 
         if mol_column is not None:
-            mol = props[mol_column]
+            mol = props.pop(mol_column)
         else:
 
             if conserve_smiles:
@@ -323,7 +331,7 @@ def _row_to_mol(row):
                 # properties.
                 smiles = props.pop(smiles_column)
 
-            mol = dm.to_mol(smiles)
+            mol = dm.to_mol(smiles, sanitize=sanitize)
 
         if mol is None:
             return None
diff --git a/datamol/io.py b/datamol/io.py
@@ -39,7 +39,7 @@ def read_csv(
         df: a `pandas.DataFrame`
     """
 
-    df = pd.read_csv(urlpath, **kwargs)
+    df: pd.DataFrame = pd.read_csv(urlpath, **kwargs)  # type: ignore
 
     if smiles_column is not None:
         PandasTools.AddMoleculeColumnToFrame(df, smiles_column, mol_column)
@@ -78,48 +78,60 @@ def read_excel(
 
 def read_sdf(
     urlpath: Union[str, os.PathLike, TextIO],
+    sanitize: bool = True,
     as_df: bool = False,
     smiles_column: Optional[str] = "smiles",
     mol_column: str = None,
     include_private: bool = False,
     include_computed: bool = False,
-    sanitize: bool = True,
     strict_parsing: bool = True,
 ) -> Union[List[Chem.rdchem.Mol], pd.DataFrame]:
     """Read an SDF file.
 
+    Note: This function is meant to be used with datasets that fit _in-memory_.
+    For more advanced usage, we suggest using `Chem.ForwardSDMolSupplier` directly.
+
     Args:
         urlpath: Path to a file or a file-like object. Path can be remote or local.
+        sanitize: Whether to sanitize the molecules.
         as_df: Whether to return a list mol or a pandas DataFrame.
         smiles_column: Name of the SMILES column. Only relevant if `as_df` is True.
         mol_column: Name of the mol column. Only relevant if `as_df` is True.
         include_private: Include private properties in the columns.  Only relevant if
             `as_df` is True.
         include_computed: Include computed properties in the columns.  Only relevant if
             `as_df` is True.
-        sanitize: Whether to sanitize the molecules
         strict_parsing: If set to false, the parser is more lax about correctness of the contents.
     """
 
     # File-like object
     if isinstance(urlpath, io.IOBase):
-        supplier = Chem.ForwardSDMolSupplier(urlpath, sanitize=False, strictParsing=strict_parsing)
-        mols = [mol for mol in supplier if mol is not None]
+        supplier = Chem.ForwardSDMolSupplier(
+            urlpath,
+            sanitize=sanitize,
+            strictParsing=strict_parsing,
+        )
+        mols = list(supplier)
 
     # Regular local or remote paths
     else:
         with fsspec.open(urlpath) as f:
+
+            # Handle gzip file if needed
             if str(urlpath).endswith(".gz") or str(urlpath).endswith(".gzip"):
                 f = gzip.open(f)
-            supplier = Chem.ForwardSDMolSupplier(f, sanitize=False, strictParsing=strict_parsing)
-            mols = [mol for mol in supplier if mol is not None]
 
-    if sanitize == True:
-        mols_props = [
-            (dm.sanitize_mol(mol), mol.GetPropsAsDict()) for mol in mols if mol is not None
-        ]
-        mols = [dm.set_mol_props(mol, props) for mol, props in mols_props]
+            supplier = Chem.ForwardSDMolSupplier(
+                f,
+                sanitize=sanitize,
+                strictParsing=strict_parsing,
+            )
+            mols = list(supplier)
 
+    # Discard None values
+    mols = [mol for mol in mols if mol is not None]
+
+    # Convert to dataframe
     if as_df:
         return dm.to_df(
             mols,
@@ -133,15 +145,15 @@ def read_sdf(
 
 
 def to_sdf(
-    mols: Union[Sequence[Chem.rdchem.Mol], pd.DataFrame],
+    mols: Union[Chem.rdchem.Mol, Sequence[Chem.rdchem.Mol], pd.DataFrame],
     urlpath: Union[str, os.PathLike, TextIO],
     smiles_column: Optional[str] = "smiles",
     mol_column: str = None,
 ):
     """Write molecules to a file.
 
     Args:
-        mols: a dataframe or a list of molecule.
+        mols: a dataframe, a molecule or a list of molecules.
         urlpath: Path to a file or a file-like object. Path can be remote or local.
         smiles_column: Column name to extract the molecule.
         mol_column: Column name to extract the molecule. It takes
@@ -151,6 +163,9 @@ def to_sdf(
     if isinstance(mols, pd.DataFrame):
         mols = dm.from_df(mols, smiles_column=smiles_column, mol_column=mol_column)
 
+    elif isinstance(mols, Chem.rdchem.Mol):
+        mols = [mols]
+
     # Filter out None values
     mols = [mol for mol in mols if mol is not None]
 
diff --git a/datamol/mol.py b/datamol/mol.py
@@ -8,6 +8,8 @@
 import copy
 import random
 
+from loguru import logger
+
 from rdkit import Chem
 from rdkit.Chem import rdmolops
 from rdkit.Chem.MolStandardize import rdMolStandardize
@@ -72,7 +74,7 @@ def to_mol(
 
     # Add hydrogens
     if _mol is not None and add_hs:
-        _mol = Chem.AddHs(_mol, explicitOnly=explicit_only)
+        _mol = Chem.AddHs(_mol, explicitOnly=explicit_only, addCoords=True)
 
     # Reorder atoms
     if _mol is not None and ordered:
@@ -154,38 +156,67 @@ def to_neutral(mol: Chem.rdchem.Mol) -> Optional[Chem.rdchem.Mol]:
 
 
 def sanitize_mol(
-    mol: Chem.rdchem.Mol, charge_neutral: bool = False, sanifix: bool = True
+    mol: Chem.rdchem.Mol,
+    charge_neutral: bool = False,
+    sanifix: bool = True,
+    verbose: bool = True,
+    add_hs: bool = False,
 ) -> Optional[Chem.rdchem.Mol]:
-    """Sanitize molecule and fix common errors.
+    """An augmented version of RDKit `sanitize=True`. It uses a
+    mol-SMILES-mol conversion to catch potential aromaticity errors
+    and tries to fix aromatic nitrogens (using the popular sanifix4 script).
+    Optionally, it can neutralize the charge of the molecule.
 
-    Warning:
-        The procedure includes a SMILES conversion to avoid accasional aromaticity
-        errors. In consequence, all the properties and the conformers will be lost.
+    Note #1: Only the first conformer (if present) will be preserved and
+    a warning will be displayed if more than one conformer is detected.
+
+    Note #2: The molecule's properties will be preserved but the atom's
+    properties will be lost.
 
     Args:
         mol: a molecule.
         charge_neutral: whether charge neutralization should be applied.
         sanifix: whether to run the sanifix from James Davidson
             (sanifix4.py) that try to adjust aromatic nitrogens.
+        verbose: Whether to display a warning when multiple conformers are detected.
+        add_hs: Add hydrogens to the returned molecule. Useful when the input
+            molecule already contains hydrogens.
 
     Returns:
         mol: a molecule.
     """
     if mol is None:
         return mol
 
+    # Extract properties.
+    original_mol = copy_mol(mol)
+    properties = original_mol.GetPropsAsDict()
+
     if charge_neutral:
         mol = to_neutral(mol)
 
     if sanifix:
         mol = _sanifix4.sanifix(mol)
 
-    if mol:
+    if mol is not None:
+
+        # Detect multiple conformers
+        if verbose and mol.GetNumConformers() > 1:
+            logger.warning(
+                f"The molecule contains multiple conformers. Only the first one will be preserved."
+            )
+
+        # Try catch to avoid occasional aromaticity errors
         try:
-            # Try catch to avoid occasional aromaticity errors
-            return to_mol(dm.to_smiles(mol), sanitize=True)  # type: ignore
+            # `cxsmiles` is used here to preserve the first conformer.
+            mol = to_mol(dm.to_smiles(mol, cxsmiles=True), sanitize=True, add_hs=add_hs)  # type: ignore
         except Exception:
-            return None
+            mol = None
+
+    if mol is not None:
+        # Insert back properties.
+        mol = dm.set_mol_props(mol, properties)
+
     return mol
 
 
@@ -276,7 +307,7 @@ def standardize_mol(
     Returns:
         mol: The standardized molecule.
     """
-    mol = copy.copy(mol)
+    mol = copy_mol(mol)
 
     if disconnect_metals:
         md = rdMolStandardize.MetalDisconnector()
@@ -584,7 +615,9 @@ def set_dative_bonds(
 
 
 def set_mol_props(
-    mol: Chem.rdchem.Mol, props: Dict[str, Any], copy: bool = False
+    mol: Chem.rdchem.Mol,
+    props: Dict[str, Any],
+    copy: bool = False,
 ) -> Chem.rdchem.Mol:
     """Set properties to a mol from a dict.
 
diff --git a/env.yml b/env.yml
@@ -44,8 +44,6 @@ dependencies:
   - markdown-include
   - mdx_truly_sane_lists
   - mike >=1.0.0
-  - markdown-it-py ==0.6.2  # no direct deps
-  - mkdocs-autorefs =0.1  # no direct deps
 
   # Releasing tools
   - rever >=0.4.5
diff --git a/news/sanitize-sdf.rst b/news/sanitize-sdf.rst
@@ -0,0 +1,28 @@
+**Added:**
+
+* Add a sanitize flag to `from_df`.
+* Automatically detect the mol column in `from_df`.
+* Add `add_hs` arg to `sanitize_mol`.
+
+**Changed:**
+
+* Allow passing a single molecule to `dm.to_sdf`, not only a list of mols.
+* Preserve mol properties and the first conformer in `dm.sanitize_mol`.
+* Display a warning message when input mol has multiple conformers in `dm.sanitize_mol`.
+
+**Deprecated:**
+
+* <news item>
+
+**Removed:**
+
+* <news item>
+
+**Fixed:**
+
+* Remove the call to `sanitize_mol` in `read_sdf`; use `sanitize=True` from RDKit instead.
+* Remove the `mol` column from the mol properties in `from_df`. It also fixes `to_sdf`.
+
+**Security:**
+
+* <news item>
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,7 +1,10 @@
 import platform
 import pathlib
+from loguru import logger
 
 import pytest
+from _pytest.logging import caplog as _caplog
+
 
 DATA_DIR_PATH = pathlib.Path(__file__).parent.resolve() / "data"
 
@@ -36,3 +39,20 @@ def pytest_configure(config):
 @pytest.fixture
 def datadir(request):
     return DATA_DIR_PATH
+
+
+@pytest.fixture
+def caplog(_caplog):
+    """Monkeypatching the pytest caplog to work with loguru.
+
+    See https://loguru.readthedocs.io/en/latest/resources/migration.html#making-things-work-with-pytest-and-caplog
+    """
+    import logging
+
+    class PropogateHandler(logging.Handler):
+        def emit(self, record):
+            logging.getLogger(record.name).handle(record)
+
+    handler_id = logger.add(PropogateHandler(), format="{message}")
+    yield _caplog
+    logger.remove(handler_id)
diff --git a/tests/test_convert.py b/tests/test_convert.py
@@ -169,11 +169,10 @@ def test_to_df_smiles_warning(datadir, caplog):
 
     assert sum(df.columns == "smiles") == 2
 
-    for record in caplog.records:
-        assert record.levelname != "WARNING"
+    assert "WARNING" in caplog.text
     assert (
         "The SMILES column name provided ('smiles') is already present in the properties of the molecules"
-        not in caplog.text
+        in caplog.text
     )
 
 
@@ -190,3 +189,19 @@ def test_to_smiles_fail():
     # NOTE(hadim): ideally you want to catch only `Boost.Python.ArgumentError` here.
     with pytest.raises(Exception):
         dm.to_smiles(55, allow_to_fail=True)
+
+
+def test_from_df_pop_mol_column():
+    df = dm.data.freesolv().iloc[:10]  # type: ignore
+    mols = [dm.to_mol(smiles) for smiles in df["smiles"]]
+
+    df: pd.DataFrame = dm.to_df(mols, mol_column="mol")  # type: ignore
+    df["dummy"] = "hello"
+
+    # test with provided mol column
+    mols = dm.from_df(df.copy(), mol_column="mol")
+    assert set(mols[0].GetPropsAsDict().keys()) == {"smiles", "dummy"}
+
+    # test with automatic mol column detection
+    mols = dm.from_df(df.copy())
+    assert set(mols[0].GetPropsAsDict().keys()) == {"smiles", "dummy"}
diff --git a/tests/test_io.py b/tests/test_io.py
diff --git a/tests/test_mol.py b/tests/test_mol.py