Source code for molvs.validations

# -*- coding: utf-8 -*-
"""
molvs.validations
~~~~~~~~~~~~~~~~~

This module contains all the built-in :class:`Validations <molvs.validations.Validation>`.

"""

from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
import logging

from rdkit import Chem

from .errors import StopValidateError
from .fragment import REMOVE_FRAGMENTS


[docs]class Validation(object): """The base class that all :class:`~molvs.validations.Validation` subclasses must inherit from.""" def __init__(self, log): self.log = logging.LoggerAdapter(log, {'validation': type(self).__name__}) def __call__(self, mol): try: self.log.debug('Running %s', type(self).__name__) self.run(mol) except Exception as e: if isinstance(e, StopValidateError): raise e else: self.log.debug('Validation failed: %s', e) def run(self, mol): """""" raise NotImplementedError("Validation subclasses must implement the run method")
[docs]class SmartsValidation(Validation): """Abstract superclass for :class:`Validations <molvs.validations.Validation>` that log a message if a SMARTS pattern matches the molecule. Subclasses can override the following attributes: """ #: The logging level of the message. level = logging.INFO #: The message to log if the SMARTS pattern matches the molecule. message = 'Molecule matched %(smarts)s' #: Whether the SMARTS pattern should match an entire covalent unit. entire_fragment = False def __init__(self, log): super(SmartsValidation, self).__init__(log) self._smarts = Chem.MolFromSmarts(self.smarts) @property def smarts(self): """The SMARTS pattern as a string. Subclasses must implement this.""" raise NotImplementedError('SmartsValidation subclasses must have a smarts attribute') def _check_matches(self, mol): if mol.HasSubstructMatch(self._smarts): self.log.log(self.level, self.message, {'smarts': self.smarts}) def _check_matches_fragment(self, mol): matches = frozenset(frozenset(match) for match in mol.GetSubstructMatches(self._smarts)) fragments = frozenset(frozenset(frag) for frag in Chem.GetMolFrags(mol)) if matches & fragments: self.log.log(self.level, self.message, {'smarts': self.smarts}) def run(self, mol): if self.entire_fragment: self._check_matches_fragment(mol) else: self._check_matches(mol)
[docs]class IsNoneValidation(Validation): """Logs an error if ``None`` is passed to the Validator. This can happen if RDKit failed to parse an input format. If the molecule is ``None``, no subsequent validations will run. """ def run(self, mol): if mol is None: self.log.error('Molecule is None') raise StopValidateError()
[docs]class NoAtomValidation(Validation): """Logs an error if the molecule has zero atoms. If the molecule has no atoms, no subsequent validations will run. """ def run(self, mol): if mol.GetNumAtoms() == 0: self.log.error('No atoms are present') raise StopValidateError()
[docs]class DichloroethaneValidation(SmartsValidation): """Logs if 1,2-dichloroethane is present. This is provided as an example of how to subclass :class:`~molvs.validations.SmartsValidation` to check for the presence of a substructure. """ level = logging.INFO smarts = '[Cl]-[#6]-[#6]-[Cl]' entire_fragment = True message = '1,2-Dichloroethane is present'
[docs]class FragmentValidation(Validation): """Logs if certain fragments are present. Subclass and override the ``fragments`` class attribute to customize the list of :class:`FragmentPatterns <molvs.fragment.FragmentPattern>`. """ #: A list of :class:`FragmentPatterns <molvs.fragment.FragmentPattern>` to check for. fragments = REMOVE_FRAGMENTS def run(self, mol): for fp in self.fragments: matches = frozenset(frozenset(match) for match in mol.GetSubstructMatches(fp.smarts)) fragments = frozenset(frozenset(frag) for frag in Chem.GetMolFrags(mol)) if matches & fragments: self.log.info('%s is present', fp.name)
[docs]class NeutralValidation(Validation): """Logs if not an overall neutral system.""" def run(self, mol): charge = Chem.GetFormalCharge(mol) if not charge == 0: chargestring = '+%s' % charge if charge > 0 else '%s' % charge self.log.info('Not an overall neutral system (%s)', chargestring)
[docs]class IsotopeValidation(Validation): """Logs if molecule contains isotopes.""" def run(self, mol): isotopes = set() for atom in mol.GetAtoms(): isotope = atom.GetIsotope() if not isotope == 0: isotopes.add('%s%s' % (isotope, atom.GetSymbol())) for isotope in isotopes: self.log.info('Molecule contains isotope %s', isotope)
#: The default list of :class:`Validations <molvs.validations.Validation>` used by :class:`~molvs.validate.Validator`. VALIDATIONS = ( IsNoneValidation, NoAtomValidation, #DichloroethaneValidation, FragmentValidation, NeutralValidation, IsotopeValidation, ) # - WARN/ERROR: Are all atoms defined/real - no query atoms or invalid elements, r-group things # - INFO: Contains unknown stereo (Perform stereochemistry perception first?) # - INFO: Nonstandard tautomer (log SMILES of tautomer parent, or the name of the tautomer transform?) # - WARN: InChI generation failed # - WARN: Contains covalent bond to metal (that would be broken by MetalDisconnector) # - WARN: Contains solvent molecules (in addition other fragment) # - WARN: More than 99 rings causes problems with SMILES # - INFO: Cis azo dye is unusual # - WARN: Adjacent atoms with like charges (i.e. both positive or both negative) # - INFO: Has more than one radical centre # - INFO: ethane, methane molecules present # - INFO: Boron, Sulfur atoms with no explicit bonds # - INFO: Solvent molecules present (only if also other fragments) # - INFO: One unknown stereocentre and no defined stereocentres (probably racemate, so info not warn) # - WARN: More than one undefined stereocentre and no defined stereocentres # - INFO: One undefined stereocentre and at least one defined stereocentre (epimer or mixture of anomers, so info not warn) # - WARN: More than one undefined stereocentre and at least one defined stereocentre # - INFO: Unknown double bond stereochemistry # - WARN: Ring containing stereobonds? # - INFO: Not canonical tautomer # Coordinates? # Info - Lack of coordinates? Uneven bond lengths? # Web services (needs to be optional) # Info - Could not match to ChemSpider ID, PubChem CID # UniChem from EBI could be useful here, otherwise use each API directly # Allow definition of MolSchema to set custom validations on e.g. # People can define a filterer # This has a series of validations, and the required output - e.g. no error or no warns?