Source code for molvs.fragment

# -*- coding: utf-8 -*-
"""
molvs.fragment
~~~~~~~~~~~~~~

This module contains tools for dealing with molecules with more than one covalently bonded unit. The main classes are
:class:`~molvs.fragment.LargestFragmentChooser`, which returns the largest covalent unit in a molecule, and
:class:`~molvs.fragment.FragmentRemover`, which filters out fragments from a molecule using SMARTS patterns.

"""

from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
import logging

from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

from .utils import memoized_property


# Module-level logger, named after this module via ``__name__``.
log = logging.getLogger(__name__)


class FragmentPattern(object):
    """A named fragment described by a SMARTS pattern."""

    def __init__(self, name, smarts):
        """Create a FragmentPattern from a name and a SMARTS string.

        :param name: A name for this FragmentPattern.
        :param smarts: A SMARTS pattern.
        """
        # The raw SMARTS text is kept as-is; parsing happens in ``smarts``.
        self.smarts_str = smarts
        self.name = name

    @memoized_property
    def smarts(self):
        """Return ``smarts_str`` parsed by ``Chem.MolFromSmarts``.

        NOTE(review): ``memoized_property`` comes from ``.utils``; presumably it caches the
        parsed pattern after first access - confirm against that helper.
        """
        pattern = Chem.MolFromSmarts(self.smarts_str)
        return pattern

    def __repr__(self):
        """Return a constructor-style representation showing name and SMARTS string."""
        template = 'FragmentPattern({!r}, {!r})'
        return template.format(self.name, self.smarts_str)

    def __str__(self):
        """Return the name of this FragmentPattern."""
        return self.name
#: The default list of :class:`FragmentPatterns <molvs.fragment.FragmentPattern>` to be used by
#: :class:`~molvs.fragment.FragmentRemover`.
#: :class:`~molvs.fragment.FragmentRemover` iterates these patterns in the order listed here, so the
#: position of an entry determines when it is tried.
REMOVE_FRAGMENTS = (
    FragmentPattern('hydrogen', '[H]'),
    FragmentPattern('fluorine', '[F]'),
    FragmentPattern('chlorine', '[Cl]'),
    FragmentPattern('bromine', '[Br]'),
    FragmentPattern('iodine', '[I]'),
    FragmentPattern('lithium', '[Li]'),
    FragmentPattern('sodium', '[Na]'),
    FragmentPattern('potassium', '[K]'),
    FragmentPattern('calcium', '[Ca]'),
    FragmentPattern('magnesium', '[Mg]'),
    FragmentPattern('aluminium', '[Al]'),
    FragmentPattern('barium', '[Ba]'),
    FragmentPattern('bismuth', '[Bi]'),
    FragmentPattern('silver', '[Ag]'),
    FragmentPattern('strontium', '[Sr]'),
    FragmentPattern('zinc', '[Zn]'),
    FragmentPattern('ammonia/ammonium', '[#7]'),
    FragmentPattern('water/hydroxide', '[#8]'),
    FragmentPattern('methyl amine', '[#6]-[#7]'),
    FragmentPattern('sulfide', 'S'),
    FragmentPattern('nitrate', '[#7](=[#8])(-[#8])-[#8]'),
    FragmentPattern('phosphate', '[P](=[#8])(-[#8])(-[#8])-[#8]'),
    FragmentPattern('hexafluorophosphate', '[P](-[#9])(-[#9])(-[#9])(-[#9])(-[#9])-[#9]'),
    FragmentPattern('sulfate', '[S](=[#8])(=[#8])(-[#8])-[#8]'),
    FragmentPattern('methyl sulfonate', '[#6]-[S](=[#8])(=[#8])(-[#8])'),
    FragmentPattern('trifluoromethanesulfonic acid', '[#8]-[S](=[#8])(=[#8])-[#6](-[#9])(-[#9])-[#9]'),
    FragmentPattern('trifluoroacetic acid', '[#9]-[#6](-[#9])(-[#9])-[#6](=[#8])-[#8]'),
    FragmentPattern('1,2-dichloroethane', '[Cl]-[#6]-[#6]-[Cl]'),
    FragmentPattern('1,2-dimethoxyethane', '[#6]-[#8]-[#6]-[#6]-[#8]-[#6]'),
    FragmentPattern('1,4-dioxane', '[#6]-1-[#6]-[#8]-[#6]-[#6]-[#8]-1'),
    FragmentPattern('1-methyl-2-pyrrolidinone', '[#6]-[#7]-1-[#6]-[#6]-[#6]-[#6]-1=[#8]'),
    FragmentPattern('2-butanone', '[#6]-[#6]-[#6](-[#6])=[#8]'),
    FragmentPattern('acetate/acetic acid', '[#8]-[#6](-[#6])=[#8]'),
    FragmentPattern('acetone', '[#6]-[#6](-[#6])=[#8]'),
    FragmentPattern('acetonitrile', '[#6]-[#6]#[N]'),
    FragmentPattern('benzene', '[#6]1[#6][#6][#6][#6][#6]1'),
    FragmentPattern('butanol', '[#8]-[#6]-[#6]-[#6]-[#6]'),
    FragmentPattern('t-butanol', '[#8]-[#6](-[#6])(-[#6])-[#6]'),
    FragmentPattern('chloroform', '[Cl]-[#6](-[Cl])-[Cl]'),
    FragmentPattern('cycloheptane', '[#6]-1-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-1'),
    FragmentPattern('cyclohexane', '[#6]-1-[#6]-[#6]-[#6]-[#6]-[#6]-1'),
    FragmentPattern('dichloromethane', '[Cl]-[#6]-[Cl]'),
    FragmentPattern('diethyl ether', '[#6]-[#6]-[#8]-[#6]-[#6]'),
    FragmentPattern('diisopropyl ether', '[#6]-[#6](-[#6])-[#8]-[#6](-[#6])-[#6]'),
    FragmentPattern('dimethyl formamide', '[#6]-[#7](-[#6])-[#6]=[#8]'),
    FragmentPattern('dimethyl sulfoxide', '[#6]-[S](-[#6])=[#8]'),
    FragmentPattern('ethanol', '[#8]-[#6]-[#6]'),
    FragmentPattern('ethyl acetate', '[#6]-[#6]-[#8]-[#6](-[#6])=[#8]'),
    FragmentPattern('formic acid', '[#8]-[#6]=[#8]'),
    FragmentPattern('heptane', '[#6]-[#6]-[#6]-[#6]-[#6]-[#6]-[#6]'),
    FragmentPattern('hexane', '[#6]-[#6]-[#6]-[#6]-[#6]-[#6]'),
    FragmentPattern('isopropanol', '[#8]-[#6](-[#6])-[#6]'),
    FragmentPattern('methanol', '[#8]-[#6]'),
    FragmentPattern('N,N-dimethylacetamide', '[#6]-[#7](-[#6])-[#6](-[#6])=[#8]'),
    FragmentPattern('pentane', '[#6]-[#6]-[#6]-[#6]-[#6]'),
    FragmentPattern('propanol', '[#8]-[#6]-[#6]-[#6]'),
    FragmentPattern('pyridine', '[#6]-1=[#6]-[#6]=[#7]-[#6]=[#6]-1'),
    FragmentPattern('t-butyl methyl ether', '[#6]-[#8]-[#6](-[#6])(-[#6])-[#6]'),
    FragmentPattern('tetrahydrofurane', '[#6]-1-[#6]-[#6]-[#8]-[#6]-1'),
    FragmentPattern('toluene', '[#6]-[#6]~1~[#6]~[#6]~[#6]~[#6]~[#6]~1'),
    FragmentPattern('xylene', '[#6]-[#6]~1~[#6](-[#6])~[#6]~[#6]~[#6]~[#6]~1')
)

#: The default value for whether to ensure at least one fragment is left after FragmentRemover is applied.
LEAVE_LAST = True

#: The default value for whether LargestFragmentChooser sees organic fragments as "larger" than inorganic fragments.
PREFER_ORGANIC = False
def is_organic(fragment):
    """Tell whether a fragment counts as organic, i.e. has at least one carbon atom.

    :param fragment: The fragment as an RDKit Mol object.
    :return: True if any atom of the fragment has atomic number 6, otherwise False.
    """
    # TODO: Consider a different definition?
    # Could allow only H, C, N, O, S, P, F, Cl, Br, I
    carbon = 6
    return any(atom.GetAtomicNum() == carbon for atom in fragment.GetAtoms())
class FragmentRemover(object):
    """A class for filtering out fragments using SMARTS patterns."""

    def __init__(self, fragments=REMOVE_FRAGMENTS, leave_last=LEAVE_LAST):
        """Initialize a FragmentRemover with an optional custom list of :class:`~molvs.fragment.FragmentPattern`.

        Setting leave_last to True will ensure at least one fragment is left in the molecule, even if it is matched by a
        :class:`~molvs.fragment.FragmentPattern`. Fragments are removed in the order specified in the list, so place
        those you would prefer to be left towards the end of the list. If all the remaining fragments match the same
        :class:`~molvs.fragment.FragmentPattern`, they will all be left.

        :param fragments: A list of :class:`~molvs.fragment.FragmentPattern` to remove.
        :param bool leave_last: Whether to ensure at least one fragment is left.
        """
        log.debug('Initializing FragmentRemover')
        self.fragments = fragments
        self.leave_last = leave_last

    def __call__(self, mol):
        """Calling a FragmentRemover instance like a function is the same as calling its remove(mol) method."""
        return self.remove(mol)

    def remove(self, mol):
        """Return the molecule with specified fragments removed.

        Patterns from ``self.fragments`` are applied one after another. When ``leave_last`` is set, processing stops
        as soon as at most one fragment remains, and a pattern that would delete everything that is left is not
        applied.

        :param mol: The molecule to remove fragments from.
        :type mol: rdkit.Chem.rdchem.Mol
        :return: The molecule with fragments removed.
        :rtype: rdkit.Chem.rdchem.Mol
        """
        log.debug('Running FragmentRemover')
        # Iterate FragmentPatterns and remove matching fragments
        for frag in self.fragments:
            # If nothing is left or leave_last and only one fragment, end here
            if mol.GetNumAtoms() == 0 or (self.leave_last and len(Chem.GetMolFrags(mol)) <= 1):
                break
            # Apply removal for this FragmentPattern
            removed = Chem.DeleteSubstructs(mol, frag.smarts, onlyFrags=True)
            if self.leave_last and removed.GetNumAtoms() == 0:
                # All the remaining fragments match this pattern - leave them all.
                # Checked before logging so that a removal which is discarded is never reported as done.
                log.debug('Leaving remaining fragments matching: %s', frag.name)
                break
            if removed.GetNumAtoms() != mol.GetNumAtoms():
                log.info('Removed fragment: %s', frag.name)
            mol = removed
        return mol
class LargestFragmentChooser(object):
    """A class for selecting the largest covalent unit in a molecule with multiple fragments."""

    def __init__(self, prefer_organic=PREFER_ORGANIC):
        """Initialize a LargestFragmentChooser.

        If prefer_organic is set to True, any organic fragment will be considered larger than any inorganic fragment. A
        fragment is considered organic if it contains a carbon atom.

        :param bool prefer_organic: Whether to prioritize organic fragments above all others.
        """
        log.debug('Initializing LargestFragmentChooser')
        self.prefer_organic = prefer_organic

    def __call__(self, mol):
        """Calling a LargestFragmentChooser instance like a function is the same as calling its choose(mol) method."""
        return self.choose(mol)

    def choose(self, mol):
        """Return the largest covalent unit.

        The largest fragment is determined by number of atoms (including hydrogens). Ties are broken by taking the
        fragment with the higher molecular weight, and then by taking the first alphabetically by SMILES if needed.

        If the molecule yields no fragments at all (e.g. it has no atoms), the input molecule is returned unchanged.

        :param mol: The molecule to choose the largest fragment from.
        :type mol: rdkit.Chem.rdchem.Mol
        :return: The largest fragment.
        :rtype: rdkit.Chem.rdchem.Mol
        """
        log.debug('Running LargestFragmentChooser')
        # TODO: Alternatively allow a list of fragments to be passed as the mol parameter
        fragments = Chem.GetMolFrags(mol, asMols=True)
        largest = None
        for f in fragments:
            smiles = Chem.MolToSmiles(f, isomericSmiles=True)
            log.debug('Fragment: %s', smiles)
            organic = is_organic(f)
            if self.prefer_organic:
                # Skip this fragment if not organic and we already have an organic fragment as the largest so far
                if largest and largest['organic'] and not organic:
                    continue
                # Reset largest if it wasn't organic and this fragment is organic
                if largest and organic and not largest['organic']:
                    largest = None
            # Count atoms, including the hydrogens attached to each atom
            atoms = sum(1 + a.GetTotalNumHs() for a in f.GetAtoms())
            # Skip this fragment if fewer atoms than the largest
            if largest and atoms < largest['atoms']:
                continue
            # Skip this fragment if equal number of atoms but weight is lower
            weight = rdMolDescriptors.CalcExactMolWt(f)
            if largest and atoms == largest['atoms'] and weight < largest['weight']:
                continue
            # Skip this fragment if equal atoms and equal weight but smiles comes last alphabetically
            if largest and atoms == largest['atoms'] and weight == largest['weight'] and smiles > largest['smiles']:
                continue
            # Otherwise this is the largest so far
            log.debug('New largest fragment: %s (%s)', smiles, atoms)
            largest = {'smiles': smiles, 'fragment': f, 'atoms': atoms, 'weight': weight, 'organic': organic}
        if largest is None:
            # No fragments were found, so there is nothing to choose between; previously this
            # raised TypeError from subscripting None.
            return mol
        return largest['fragment']