# -*- coding: utf-8 -*-
"""
molvs.standardize
~~~~~~~~~~~~~~~~~
This module contains the main :class:`~molvs.standardize.Standardizer` class that can be used to perform all
standardization tasks, as well as convenience functions like :func:`~molvs.standardize.standardize_smiles` for common
standardization tasks.
"""
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
import copy
import logging
from rdkit import Chem
from .metal import MetalDisconnector
from .fragment import PREFER_ORGANIC, LargestFragmentChooser, FragmentRemover
from .normalize import NORMALIZATIONS, MAX_RESTARTS, Normalizer
from .tautomer import TAUTOMER_TRANSFORMS, TAUTOMER_SCORES, MAX_TAUTOMERS, TautomerCanonicalizer, TautomerEnumerator
from .charge import ACID_BASE_PAIRS, CHARGE_CORRECTIONS, Reionizer, Uncharger
from .utils import memoized_property
log = logging.getLogger(__name__)
[docs]class Standardizer(object):
"""The main class for performing standardization of molecules and deriving parent molecules.
The primary usage is via the :meth:`~molvs.standardize.Standardizer.standardize` method::
s = Standardizer()
mol1 = Chem.MolFromSmiles('C1=CC=CC=C1')
mol2 = s.standardize(mol1)
There are separate methods to derive fragment, charge, tautomer, isotope and stereo parent molecules.
"""
def __init__(self, normalizations=NORMALIZATIONS, acid_base_pairs=ACID_BASE_PAIRS,
charge_corrections=CHARGE_CORRECTIONS, tautomer_transforms=TAUTOMER_TRANSFORMS,
tautomer_scores=TAUTOMER_SCORES, max_restarts=MAX_RESTARTS, max_tautomers=MAX_TAUTOMERS,
prefer_organic=PREFER_ORGANIC):
"""Initialize a Standardizer with optional custom parameters.
:param normalizations: A list of Normalizations to apply (default: :data:`~molvs.normalize.NORMALIZATIONS`).
:param acid_base_pairs: A list of AcidBasePairs for competitive reionization (default:
:data:`~molvs.charge.ACID_BASE_PAIRS`).
:param charge_corrections: A list of ChargeCorrections to apply (default:
:data:`~molvs.charge.CHARGE_CORRECTIONS`).
:param tautomer_transforms: A list of TautomerTransforms to apply (default:
:data:`~molvs.tautomer.TAUTOMER_TRANSFORMS`).
:param tautomer_scores: A list of TautomerScores used to determine canonical tautomer (default:
:data:`~molvs.tautomer.TAUTOMER_SCORES`).
:param max_restarts: The maximum number of times to attempt to apply the series of normalizations (default 200).
:param max_tautomers: The maximum number of tautomers to enumerate (default 1000).
:param prefer_organic: Whether to prioritize organic fragments when choosing fragment parent (default False).
"""
log.debug('Initializing Standardizer')
self.normalizations = normalizations
self.acid_base_pairs = acid_base_pairs
self.charge_corrections = charge_corrections
self.tautomer_transforms = tautomer_transforms
self.tautomer_scores = tautomer_scores
self.max_restarts = max_restarts
self.max_tautomers = max_tautomers
self.prefer_organic = prefer_organic
[docs] def __call__(self, mol):
"""Calling a Standardizer instance like a function is the same as calling its
:meth:`~molvs.standardize.Standardizer.standardize` method."""
return self.standardize(mol)
[docs] def standardize(self, mol):
"""Return a standardized version the given molecule.
The standardization process consists of the following stages: RDKit
:py:func:`~rdkit.Chem.rdmolops.RemoveHs`, RDKit :py:func:`~rdkit.Chem.rdmolops.SanitizeMol`,
:class:`~molvs.metal.MetalDisconnector`, :class:`~molvs.normalize.Normalizer`,
:class:`~molvs.charge.Reionizer`, RDKit :py:func:`~rdkit.Chem.rdmolops.AssignStereochemistry`.
:param mol: The molecule to standardize.
:type mol: rdkit.Chem.rdchem.Mol
:returns: The standardized molecule.
:rtype: rdkit.Chem.rdchem.Mol
"""
mol = copy.deepcopy(mol)
Chem.SanitizeMol(mol)
mol = Chem.RemoveHs(mol)
mol = self.disconnect_metals(mol)
mol = self.normalize(mol)
mol = self.reionize(mol)
Chem.AssignStereochemistry(mol, force=True, cleanIt=True)
# TODO: Check this removes symmetric stereocenters
return mol
[docs] def tautomer_parent(self, mol, skip_standardize=False):
"""Return the tautomer parent of a given molecule.
:param mol: The input molecule.
:type mol: rdkit.Chem.rdchem.Mol
:param bool skip_standardize: Set to True if mol has already been standardized.
:returns: The tautomer parent molecule.
:rtype: rdkit.Chem.rdchem.Mol
"""
if not skip_standardize:
mol = self.standardize(mol)
tautomer = self.canonicalize_tautomer(mol)
tautomer = self.standardize(tautomer)
return tautomer
[docs] def fragment_parent(self, mol, skip_standardize=False):
"""Return the fragment parent of a given molecule.
The fragment parent is the largest organic covalent unit in the molecule.
:param mol: The input molecule.
:type mol: rdkit.Chem.rdchem.Mol
:param bool skip_standardize: Set to True if mol has already been standardized.
:returns: The fragment parent molecule.
:rtype: rdkit.Chem.rdchem.Mol
"""
if not skip_standardize:
mol = self.standardize(mol)
# TODO: Consider applying FragmentRemover first to remove salts, solvents?
fragment = self.largest_fragment(mol)
return fragment
[docs] def stereo_parent(self, mol, skip_standardize=False):
"""Return the stereo parent of a given molecule.
The stereo parent has all stereochemistry information removed from tetrahedral centers and double bonds.
:param mol: The input molecule.
:type mol: rdkit.Chem.rdchem.Mol
:param bool skip_standardize: Set to True if mol has already been standardized.
:returns: The stereo parent molecule.
:rtype: rdkit.Chem.rdchem.Mol
"""
if not skip_standardize:
mol = self.standardize(mol)
else:
mol = copy.deepcopy(mol)
Chem.RemoveStereochemistry(mol)
return mol
[docs] def isotope_parent(self, mol, skip_standardize=False):
"""Return the isotope parent of a given molecule.
The isotope parent has all atoms replaced with the most abundant isotope for that element.
:param mol: The input molecule.
:type mol: rdkit.Chem.rdchem.Mol
:param bool skip_standardize: Set to True if mol has already been standardized.
:returns: The isotope parent molecule.
:rtype: rdkit.Chem.rdchem.Mol
"""
if not skip_standardize:
mol = self.standardize(mol)
else:
mol = copy.deepcopy(mol)
# Replace isotopes with common weight
for atom in mol.GetAtoms():
atom.SetIsotope(0)
return mol
[docs] def charge_parent(self, mol, skip_standardize=False):
"""Return the charge parent of a given molecule.
The charge parent is the uncharged version of the fragment parent.
:param mol: The input molecule.
:type mol: rdkit.Chem.rdchem.Mol
:param bool skip_standardize: Set to True if mol has already been standardized.
:returns: The charge parent molecule.
:rtype: rdkit.Chem.rdchem.Mol
"""
# TODO: All ionized acids and bases should be neutralised.
if not skip_standardize:
mol = self.standardize(mol)
fragment = self.fragment_parent(mol, skip_standardize=True)
if fragment:
uncharged = self.uncharge(fragment)
# During final standardization, the Reionizer ensures any remaining charges are in the right places
uncharged = self.standardize(uncharged)
return uncharged
[docs] def super_parent(self, mol, skip_standardize=False):
"""Return the super parent of a given molecule.
THe super parent is fragment, charge, isotope, stereochemistry and tautomer insensitive. From the input
molecule, the largest fragment is taken. This is uncharged and then isotope and stereochemistry information is
discarded. Finally, the canonical tautomer is determined and returned.
:param mol: The input molecule.
:type mol: rdkit.Chem.rdchem.Mol
:param bool skip_standardize: Set to True if mol has already been standardized.
:returns: The super parent molecule.
:rtype: rdkit.Chem.rdchem.Mol
"""
if not skip_standardize:
mol = self.standardize(mol)
# We don't need to get fragment parent, because the charge parent is the largest fragment
mol = self.charge_parent(mol, skip_standardize=True)
mol = self.isotope_parent(mol, skip_standardize=True)
mol = self.stereo_parent(mol, skip_standardize=True)
mol = self.tautomer_parent(mol, skip_standardize=True)
mol = self.standardize(mol)
return mol
def standardize_with_parents(self, mol):
""""""
standardized = self.standardize(mol)
tautomer = self.tautomer_parent(standardized, skip_standardize=True)
super = self.super_parent(standardized, skip_standardize=True)
# TODO: Add other parents - have optional argument to specify which are wanted
mols = {
'standardized': standardized,
'tautomer_parent': tautomer,
'super_parent': super
}
return mols
# TODO: All unique tautomers
# TODO: All unique fragments (each has to be standardized again?)
@memoized_property
def disconnect_metals(self):
"""
:returns: A callable :class:`~molvs.metal.MetalDisconnector` instance.
"""
return MetalDisconnector()
@memoized_property
def normalize(self):
"""
:returns: A callable :class:`~molvs.normalize.Normalizer` instance.
"""
return Normalizer(normalizations=self.normalizations, max_restarts=self.max_restarts)
@memoized_property
def reionize(self):
"""
:returns: A callable :class:`~molvs.charge.Reionizer` instance.
"""
return Reionizer(acid_base_pairs=self.acid_base_pairs, charge_corrections=self.charge_corrections)
@memoized_property
def uncharge(self):
"""
:returns: A callable :class:`~molvs.charge.Uncharger` instance.
"""
return Uncharger(acid_base_pairs=self.acid_base_pairs)
@memoized_property
def remove_fragments(self):
"""
:returns: A callable :class:`~molvs.fragment.FragmentRemover` instance.
"""
return FragmentRemover()
@memoized_property
def largest_fragment(self):
"""
:returns: A callable :class:`~molvs.fragment.LargestFragmentChooser` instance.
"""
return LargestFragmentChooser(prefer_organic=self.prefer_organic)
@memoized_property
def enumerate_tautomers(self):
"""
:returns: A callable :class:`~molvs.tautomer.TautomerEnumerator` instance.
"""
return TautomerEnumerator(transforms=self.tautomer_transforms, max_tautomers=self.max_tautomers)
@memoized_property
def canonicalize_tautomer(self):
"""
:returns: A callable :class:`~molvs.tautomer.TautomerCanonicalizer` instance.
"""
return TautomerCanonicalizer(transforms=self.tautomer_transforms, scores=self.tautomer_scores,
max_tautomers=self.max_tautomers)
[docs]def standardize_smiles(smiles):
"""Return a standardized canonical SMILES string given a SMILES string.
Note: This is a convenience function for quickly standardizing a single SMILES string. It is more efficient to use
the :class:`~molvs.standardize.Standardizer` class directly when working with many molecules or when custom options
are needed.
:param string smiles: The SMILES for the molecule.
:returns: The SMILES for the standardized molecule.
:rtype: string.
"""
# Skip sanitize as standardize does this anyway
mol = Chem.MolFromSmiles(smiles, sanitize=False)
mol = Standardizer().standardize(mol)
return Chem.MolToSmiles(mol, isomericSmiles=True)
[docs]def enumerate_tautomers_smiles(smiles):
"""Return a set of tautomers as SMILES strings, given a SMILES string.
:param smiles: A SMILES string.
:returns: A set containing SMILES strings for every possible tautomer.
:rtype: set of strings.
"""
# Skip sanitize as standardize does this anyway
mol = Chem.MolFromSmiles(smiles, sanitize=False)
mol = Standardizer().standardize(mol)
tautomers = TautomerEnumerator().enumerate(mol)
return {Chem.MolToSmiles(m, isomericSmiles=True) for m in tautomers}
[docs]def canonicalize_tautomer_smiles(smiles):
"""Return a standardized canonical tautomer SMILES string given a SMILES string.
Note: This is a convenience function for quickly standardizing and finding the canonical tautomer for a single
SMILES string. It is more efficient to use the :class:`~molvs.standardize.Standardizer` class directly when working
with many molecules or when custom options are needed.
:param string smiles: The SMILES for the molecule.
:returns: The SMILES for the standardize canonical tautomer.
:rtype: string.
"""
# Skip sanitize as standardize does this anyway
mol = Chem.MolFromSmiles(smiles, sanitize=False)
mol = Standardizer().standardize(mol)
tautomer = TautomerCanonicalizer().canonicalize(mol)
return Chem.MolToSmiles(tautomer, isomericSmiles=True)