MayaChemTools

    1 #!/bin/env python
    2 #
    3 # File: RDKitPerformPositionalAnalogueScan.py
    4 # Author: Manish Sud <msud@san.rr.com>
    5 #
    6 # Copyright (C) 2024 Manish Sud. All rights reserved.
    7 #
    8 # The functionality available in this script is implemented using RDKit, an
    9 # open source toolkit for cheminformatics developed by Greg Landrum.
   10 #
   11 # This file is part of MayaChemTools.
   12 #
   13 # MayaChemTools is free software; you can redistribute it and/or modify it under
   14 # the terms of the GNU Lesser General Public License as published by the Free
   15 # Software Foundation; either version 3 of the License, or (at your option) any
   16 # later version.
   17 #
   18 # MayaChemTools is distributed in the hope that it will be useful, but without
   19 # any warranty; without even the implied warranty of merchantability of fitness
   20 # for a particular purpose.  See the GNU Lesser General Public License for more
   21 # details.
   22 #
   23 # You should have received a copy of the GNU Lesser General Public License
   24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
   25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
   26 # Boston, MA, 02111-1307, USA.
   27 #
   28 
   29 from __future__ import print_function
   30 
   31 # Add local python path to the global path and import standard library modules...
   32 import os
   33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
   34 import time
   35 import re
   36 import multiprocessing as mp
   37 from itertools import combinations
   38 from itertools import permutations
   39 
   40 # RDKit imports...
   41 try:
   42     from rdkit import rdBase
   43     from rdkit import Chem
   44     from rdkit.Chem import AllChem
   45 except ImportError as ErrMsg:
   46     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
   47     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
   48     sys.exit(1)
   49 
   50 # MayaChemTools imports...
   51 try:
   52     from docopt import docopt
   53     import MiscUtil
   54     import RDKitUtil
   55 except ImportError as ErrMsg:
   56     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
   57     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
   58     sys.exit(1)
   59 
   60 ScriptName = os.path.basename(sys.argv[0])
   61 Options = {}
   62 OptionsInfo = {}
   63 
   64 def main():
   65     """Start execution of the script."""
   66     
   67     MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))
   68     
   69     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
   70     
   71     # Retrieve command line arguments and options...
   72     RetrieveOptions()
   73     
   74     # Process and validate command line arguments and options...
   75     ProcessOptions()
   76     
   77     # Perform actions required by the script...
   78     PerformPositionalAnalogueScan()
   79     
   80     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
   81     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
   82 
   83 def PerformPositionalAnalogueScan():
   84     """Perform positional analogue scan."""
   85     
   86     # Setup a molecule reader for input file...
   87     MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
   88     OptionsInfo["InfileParams"]["AllowEmptyMols"] = True
   89     Mols  = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])
   90 
   91     # Set up a molecule writer...
   92     Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
   93     if Writer is None:
   94         MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"])
   95     MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])
   96 
   97     MolCount, ValidMolCount, SearchPatternMissingCount, AnaloguesGenerationFailedCount, AnaloguesCount = ProcessMolecules(Mols, Writer)
   98 
   99     if Writer is not None:
  100         Writer.close()
  101     
  102     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
  103     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
  104     if not OptionsInfo["RxnSMARTSMode"]:
  105         MiscUtil.PrintInfo("Number of molecules with missing search pattern: %d" % SearchPatternMissingCount)
  106     MiscUtil.PrintInfo("Number of molecules failed during generation of analogues: %d" % AnaloguesGenerationFailedCount)
  107     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount + SearchPatternMissingCount + AnaloguesGenerationFailedCount))
  108     
  109     MiscUtil.PrintInfo("\nTotal number of analogues for valid molecules: %d" % AnaloguesCount)
  110     MiscUtil.PrintInfo("Average number of analogues per valid molecule: %d" % (int(AnaloguesCount/(ValidMolCount))))
  111 
  112 def ProcessMolecules(Mols, Writer):
  113     """Process molecules to generate analogues."""
  114     
  115     if OptionsInfo["MPMode"]:
  116         return ProcessMoleculesUsingMultipleProcesses( Mols, Writer)
  117     else:
  118         return ProcessMoleculesUsingSingleProcess(Mols, Writer)
  119 
  120 def ProcessMoleculesUsingSingleProcess(Mols, Writer):
  121     """Process molecules to generate analogues using a single process."""
  122 
  123     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
  124     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
  125 
  126     (MolCount, ValidMolCount, SearchPatternMissingCount, AnaloguesGenerationFailedCount, AnaloguesCount) = [0] * 5
  127 
  128     # Initialize search and target patterns info...
  129     SetupSearchAndTargetPatternsInfo()
  130 
  131     FirstMol = True
  132     for Mol in Mols:
  133         MolCount += 1
  134         
  135         if Mol is None:
  136             continue
  137         
  138         if RDKitUtil.IsMolEmpty(Mol):
  139             if not OptionsInfo["QuietMode"]:
  140                 MolName = RDKitUtil.GetMolName(Mol, MolCount)
  141                 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
  142             continue
  143         ValidMolCount += 1
  144 
  145         MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus = GenerateMolAnalogues(Mol, MolCount)
  146         
  147         if FirstMol:
  148             FirstMol = False
  149             if SetSMILESMolProps:
  150                 if Writer is not None:
  151                     RDKitUtil.SetWriterMolProps(Writer, Mol)
  152         
  153         if not OptionsInfo["RxnSMARTSMode"]:
  154             if SearchPatternMissingStatus:
  155                 SearchPatternMissingCount += 1
  156                 WriteMolecule(Writer, Mol, Compute2DCoords)
  157                 continue
  158         
  159         if not AnaloguesGenerationStatus:
  160             AnaloguesGenerationFailedCount += 1
  161             WriteMolecule(Writer, Mol, Compute2DCoords)
  162             continue
  163         
  164         # Write out molecues and analogues...
  165         WriteMolAnalogues(Writer, Mol, MolCount, MolAnalogues, Compute2DCoords)
  166         
  167         AnaloguesCount += len(MolAnalogues)
  168 
  169     return (MolCount, ValidMolCount, SearchPatternMissingCount, AnaloguesGenerationFailedCount, AnaloguesCount)
  170 
  171 def ProcessMoleculesUsingMultipleProcesses(Mols, Writer):
  172     """Process molecules to generate analogues using multiprocessing."""
  173 
  174     MPParams = OptionsInfo["MPParams"]
  175     
  176     # Setup data for initializing a worker process...
  177     MiscUtil.PrintInfo("Encoding options info...")
  178 
  179     InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo))
  180     
  181     # Setup a encoded mols data iterable for a worker process...
  182     WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)
  183 
  184     # Setup process pool along with data initialization for each process...
  185     MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()"))
  186     MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"])))
  187     
  188     ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)
  189     
  190     # Start processing...
  191     if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
  192         Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
  193     elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
  194         Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
  195     else:
  196         MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"]))
  197     
  198     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
  199     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
  200 
  201     (MolCount, ValidMolCount, SearchPatternMissingCount, AnaloguesGenerationFailedCount, AnaloguesCount) = [0] * 5
  202     FirstMol = True
  203     for Result in Results:
  204         MolCount += 1
  205         MolIndex, EncodedMol, EncodedMolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus  = Result
  206         
  207         if EncodedMol is None:
  208             continue
  209         ValidMolCount += 1
  210 
  211         Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
  212         
  213         if FirstMol:
  214             FirstMol = False
  215             if SetSMILESMolProps:
  216                 if Writer is not None:
  217                     RDKitUtil.SetWriterMolProps(Writer, Mol)
  218         
  219         if not OptionsInfo["RxnSMARTSMode"]:
  220             if SearchPatternMissingStatus:
  221                 SearchPatternMissingCount += 1
  222                 WriteMolecule(Writer, Mol, Compute2DCoords)
  223                 continue
  224         
  225         if not AnaloguesGenerationStatus:
  226             AnaloguesGenerationFailedCount += 1
  227             WriteMolecule(Writer, Mol, Compute2DCoords)
  228             continue
  229         
  230         MolAnalogues = [RDKitUtil.MolFromBase64EncodedMolString(EncodedMolAnalogue) for EncodedMolAnalogue in EncodedMolAnalogues]
  231         
  232         # Write out molecues and analogues...
  233         WriteMolAnalogues(Writer, Mol, MolCount, MolAnalogues, Compute2DCoords)
  234         
  235         AnaloguesCount += len(MolAnalogues)
  236     
  237     return (MolCount, ValidMolCount, SearchPatternMissingCount, AnaloguesGenerationFailedCount, AnaloguesCount)
  238     
  239 def InitializeWorkerProcess(*EncodedArgs):
  240     """Initialize data for a worker process."""
  241     
  242     global Options, OptionsInfo
  243 
  244     MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())
  245 
  246     # Decode Options and OptionInfo...
  247     Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
  248     OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])
  249     
  250     # Initialize search and target  patterns info...
  251     SetupSearchAndTargetPatternsInfo()
  252 
  253 def WorkerProcess(EncodedMolInfo):
  254     """Process data for a worker process."""
  255 
  256     MolIndex, EncodedMol = EncodedMolInfo
  257 
  258     MolAnalogues = None
  259     SearchPatternMissingStatus = False
  260     AnaloguesGenerationStatus = True
  261     
  262     if EncodedMol is None:
  263         return [MolIndex, None, MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus]
  264 
  265     Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
  266     if RDKitUtil.IsMolEmpty(Mol):
  267         if not OptionsInfo["QuietMode"]:
  268             MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
  269             MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
  270         return [MolIndex, None, MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus]
  271 
  272     MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus = GenerateMolAnalogues(Mol, (MolIndex +1))
  273 
  274     EncodedMolAnalogues = None
  275     if MolAnalogues is not None:
  276         EncodedMolAnalogues = [RDKitUtil.MolToBase64EncodedMolString(MolAnalogue, PropertyPickleFlags = Chem.PropertyPickleOptions.MolProps | Chem.PropertyPickleOptions.PrivateProps) for MolAnalogue in MolAnalogues]
  277     
  278     return [MolIndex, EncodedMol, EncodedMolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus]
  279 
  280 def GenerateMolAnalogues(Mol, MolNum = None):
  281     """Generate analogues."""
  282 
  283     if OptionsInfo["AttachAtomsMode"] or OptionsInfo["ReplaceAtomsMode"] or OptionsInfo["AttachSMILESMode"]:
  284         return GenerateMolAnaloguesForMultipleScanModes(Mol, MolNum)
  285     elif OptionsInfo["RxnSMARTSMode"]:
  286         return GenerateMolAnaloguesForRxnSMARTSMode(Mol, MolNum)
  287     elif OptionsInfo["TandemScanMode"]:
  288         return GenerateMolAnaloguesForTandemScanMode(Mol, MolNum)
  289     else:
  290         MiscUtil.PrintError("Failed to generate analogues.  The mode value, %s, specified using \"-m, --mode\" is not supported." % (OptionsInfo["Mode"]))
  291 
  292 def GenerateMolAnaloguesForMultipleScanModes(Mol, MolNum = None):
  293     """Generate analogues for the following scan modes: AttachAtom, ReplaceAtom
  294     or AttachSMILES."""
  295 
  296     MolAnalogues = None
  297     SearchPatternMissingStatus = False
  298     AnaloguesGenerationStatus = True
  299 
  300     # Retrieve matched atoms...
  301     MolMatchedAtomIndices = SetupMolMatchedAtomIndices(Mol, MolNum)
  302     if MolMatchedAtomIndices is None:
  303         SearchPatternMissingStatus = True
  304         return (MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus)
  305 
  306     MolAnalogues = []
  307     MolAnaloguesSet = set()
  308 
  309     TargetPatternInfo = OptionsInfo["TargetPatternInfo"]
  310     for TargetPatternIndex, TargetPatternObject in enumerate(TargetPatternInfo["PatternObjects"]):
  311         for AtomIndicesCombination in combinations(MolMatchedAtomIndices, OptionsInfo["ComboSearchAtoms"]):
  312             MolAnalogue = Chem.RWMol(Mol)
  313         
  314             for AtomIndex in AtomIndicesCombination:
  315                 if OptionsInfo["AttachAtomsMode"]:
  316                     AtomicNumber = TargetPatternObject
  317                     MolAnalogue = AttachAtom(MolAnalogue, AtomIndex, AtomicNumber, BondOrder = Chem.rdchem.BondType.SINGLE)
  318                 elif OptionsInfo["ReplaceAtomsMode"]:
  319                     AtomicNumber = TargetPatternObject
  320                     MolAnalogue = ReplaceAtom(MolAnalogue, AtomIndex, AtomicNumber)
  321                 elif OptionsInfo["AttachSMILESMode"]:
  322                     AttachAtomIndex = MolAnalogue.GetNumAtoms() - 1 + TargetPatternInfo["AttachSMILESAtomIndexOffsets"][TargetPatternIndex]
  323                     SMILESMol = TargetPatternObject
  324                     MolAnalogue = AttachMol(MolAnalogue, AtomIndex, SMILESMol, AttachAtomIndex, BondOrder = Chem.rdchem.BondType.SINGLE)
  325                 
  326             if  SanitizeMolAnalogue(MolAnalogue, MolNum):
  327                 TrackAnalogues(MolAnalogue, MolAnaloguesSet, MolAnalogues)
  328         
  329     if len(MolAnalogues) == 0:
  330         AnaloguesGenerationStatus = False
  331         
  332     return (MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus)
  333 
  334 def GenerateMolAnaloguesForRxnSMARTSMode(Mol, MolNum = None):
  335     """Generate analogues for reaction SMARTS mode."""
  336 
  337     SearchPatternMissingStatus = False
  338     AnaloguesGenerationStatus = True
  339 
  340     MolAnalogues = []
  341     MolAnaloguesSet = set()
  342     
  343     TargetPatternInfo = OptionsInfo["TargetPatternInfo"]
  344     for Rxn in TargetPatternInfo["PatternObjects"]:
  345         RxnProducts = Rxn.RunReactants((Mol,))
  346         
  347         if not len(RxnProducts):
  348             continue
  349         
  350         for RxnProduct in RxnProducts:
  351             MolAnalogue = RxnProduct[0]
  352             
  353             if  SanitizeMolAnalogue(MolAnalogue, MolNum):
  354                 TrackAnalogues(MolAnalogue, MolAnaloguesSet, MolAnalogues)
  355     
  356     if len(MolAnalogues) == 0:
  357         AnaloguesGenerationStatus = False
  358         
  359     return (MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus)
  360 
  361 def GenerateMolAnaloguesForTandemScanMode(Mol, MolNum = None):
  362     """Generate analogues for tandem scan mode."""
  363 
  364     MolAnalogues = None
  365     SearchPatternMissingStatus = False
  366     AnaloguesGenerationStatus = True
  367 
  368     # Retrieve matched atoms...
  369     MolMatchedAtomIndices = SetupMolMatchedAtomIndices(Mol, MolNum)
  370     if MolMatchedAtomIndices is None:
  371         SearchPatternMissingStatus = True
  372         return (MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus)
  373 
  374     TargetPatternTandemScanInfo = OptionsInfo["TargetPatternTandemScanInfo"]
  375     NumOfTandemOperations = 0
  376     if TargetPatternTandemScanInfo is not None and len(TargetPatternTandemScanInfo):
  377         NumOfTandemOperations = len(TargetPatternTandemScanInfo[0]["Names"])
  378     
  379     if  len(MolMatchedAtomIndices) < NumOfTandemOperations:
  380         if not OptionsInfo["QuietMode"]:
  381             MolName = RDKitUtil.GetMolName(Mol, MolNum)
  382             MiscUtil.PrintWarning("Number of matched search pattern atoms, %s, must be >= number of specified operations, %s, during TandemScan mode for molecule %s..." % (len(MolMatchedAtomIndices), NumOfTandemOperations, MolName))
  383         SearchPatternMissingStatus = True
  384         return (MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus)
  385     
  386     MolAnalogues = []
  387     MolAnaloguesSet = set()
  388     for AtomIndicesCombination in combinations(MolMatchedAtomIndices, NumOfTandemOperations):
  389         AtomIndicesPermutations = []
  390         if OptionsInfo["PermuteTandemScan"]:
  391             for AtomIndicesPermutation in permutations(AtomIndicesCombination, NumOfTandemOperations):
  392                 AtomIndicesPermutations.append(list(AtomIndicesPermutation))
  393         else:
  394             AtomIndicesPermutations.append(list(AtomIndicesCombination))
  395         
  396         for AtomIndicesPermutation in AtomIndicesPermutations:
  397             for PatternInfo in TargetPatternTandemScanInfo:
  398                 MolAnalogue = Chem.RWMol(Mol)
  399                 
  400                 for NameIndex, Name in enumerate(PatternInfo["Names"]):
  401                     AtomIndex = AtomIndicesPermutation[NameIndex]
  402                     PatternObject = PatternInfo["PatternObjects"][NameIndex]
  403                     
  404                     if re.match("^AttachAtoms$", Name, re.I):
  405                         MolAnalogue = AttachAtom(MolAnalogue, AtomIndex, PatternObject, BondOrder = Chem.rdchem.BondType.SINGLE)
  406                     elif re.match("^ReplaceAtoms$", Name, re.I):
  407                         MolAnalogue = ReplaceAtom(MolAnalogue, AtomIndex, PatternObject)
  408                     elif re.match("^AttachSMILES$", Name, re.I):
  409                         AttachAtomIndex = MolAnalogue.GetNumAtoms() - 1 + PatternInfo["AttachSMILESAtomIndexOffsets"][NameIndex]
  410                         MolAnalogue = AttachMol(MolAnalogue, AtomIndex, PatternObject, AttachAtomIndex, BondOrder = Chem.rdchem.BondType.SINGLE)
  411                         MolAnalogue = Chem.RWMol(MolAnalogue)
  412                     
  413                 if not SanitizeMolAnalogue(MolAnalogue, MolNum):
  414                     continue
  415                 
  416                 TrackAnalogues(MolAnalogue, MolAnaloguesSet, MolAnalogues)
  417             
  418     if len(MolAnalogues) == 0:
  419         AnaloguesGenerationStatus = False
  420         
  421     return (MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus)
  422 
  423 def TrackAnalogues(MolAnalogue, MolAnaloguesSet, MolAnalogues):
  424     """Track analogues."""
  425     
  426     if OptionsInfo["UniqueAnalogues"]:
  427         AnalogueSMILES = Chem.MolToSmiles(Chem.RemoveHs(MolAnalogue))
  428         if AnalogueSMILES not in MolAnaloguesSet:
  429             MolAnaloguesSet.add(AnalogueSMILES)
  430             MolAnalogues.append(MolAnalogue)
  431     else:
  432         MolAnalogues.append(MolAnalogue)
  433     
  434 def ReplaceAtom(Mol, AtomIndex, AtomicNumber):
  435     """Replace an existing atom with a new atom in a molecule."""
  436     
  437     Atom = Mol.GetAtomWithIdx(AtomIndex)
  438     Atom.SetAtomicNum(AtomicNumber)
  439 
  440     return Mol
  441 
  442 def AttachAtom(Mol, AtomIndex, AtomicNumber, BondOrder = Chem.rdchem.BondType.SINGLE):
  443     """Attach atom to an existing atom in a molecule."""
  444     
  445     NewAtomIndex = Mol.AddAtom(Chem.Atom(AtomicNumber))
  446     Mol.AddBond(AtomIndex, NewAtomIndex, BondOrder)
  447 
  448     return Mol
  449 
  450 def AttachMol(Mol, AtomIndex, AttachMol, AttachAtomIndex, BondOrder = Chem.rdchem.BondType.SINGLE):
  451     """Attach molecule to an exisiting molecule."""
  452     
  453     NewMol = Chem.CombineMols(Mol, AttachMol)
  454     
  455     NewMol = Chem.EditableMol(NewMol)
  456     NewMol.AddBond(AtomIndex, AttachAtomIndex, order = BondOrder)
  457     NewMol = NewMol.GetMol()
  458     
  459     return NewMol
  460 
  461 def SanitizeMolAnalogue(Mol, MolNum):
  462     """Sanitize analogue and catch any errors."""
  463     
  464     Status = True
  465     try:
  466         Chem.SanitizeMol(Mol)
  467     except Exception as ErrMsg:
  468         if not OptionsInfo["QuietMode"]:
  469             MolName = RDKitUtil.GetMolName(Mol, MolNum)
  470             MiscUtil.PrintInfo("\n%s\nFailed to sanitize analogue of molecule %s...\nIgnoring analogue..." % (ErrMsg, MolName))
  471         Status = False
  472 
  473     return Status
  474 
  475 def SetupMolMatchedAtomIndices(Mol, MolNum):
  476     """Setup a list of matched atom indices corresponding to first atom in search
  477     SMARTS pattern for a molecule."""
  478 
  479     MatchedAtoms = []
  480     
  481     # Retrieve first atom match for PAS...
  482     MatchedAtomIndices = [AtomIndices[0] for AtomIndices in Mol.GetSubstructMatches(OptionsInfo["SearchPatternMol"])]
  483     if len(MatchedAtomIndices) == 0:
  484         if not OptionsInfo["QuietMode"]:
  485             MolName = RDKitUtil.GetMolName(Mol, MolNum)
  486             MiscUtil.PrintWarning("No search pattern match for molecule %s..." % MolName)
  487         return None
  488     
  489     return MatchedAtomIndices
  490 
  491 def WriteMolAnalogues(Writer, Mol, MolNum, MolAnalogues, Compute2DCoords):
  492     """Write molecule analogues."""
  493 
  494     # Write out molecule...
  495     WriteMolecule(Writer, Mol, Compute2DCoords)
  496     
  497     if MolAnalogues is None:
  498         return
  499 
  500     # Write out analogues...
  501     for Index, MolAnalogue in enumerate(MolAnalogues):
  502         MolName = RDKitUtil.GetMolName(Mol, MolNum)
  503         SetMolAnalogueName(MolAnalogue, MolName, (Index + 1))
  504 
  505         WriteMolecule(Writer, MolAnalogue, Compute2DCoords)
  506     
  507 def SetMolAnalogueName(MolAnalogue, MolName, AnalogueCount):
  508     """Set analogue name."""
  509     
  510     NameSuffix = OptionsInfo["NameSuffix"]
  511     AnalogueName = "%s_%s%d" % (MolName, NameSuffix, AnalogueCount)
  512     MolAnalogue.SetProp("_Name", AnalogueName)
  513 
  514 def WriteMolecule(Writer, Mol, Compute2DCoords):
  515     """Write out molecule."""
  516     
  517     if Compute2DCoords:
  518         AllChem.Compute2DCoords(Mol)
  519     
  520     Writer.write(Mol)
  521 
  522 def SetupSearchAndTargetPatternsInfo():
  523     """Setup search and target patterns info."""
  524 
  525     # Setup search pattern molecule...
  526     SearchPattern = OptionsInfo["SearchPattern"]
  527     SearchPatternMol = None
  528     if SearchPattern is not None:
  529         SearchPatternMol = Chem.MolFromSmarts(SearchPattern)
  530         if SearchPatternMol is None:
  531             MiscUtil.PrintError("Failed to create search pattern molecule. The torsion SMILES/SMARTS pattern, \"%s\", specified using \"-s, --searchPattern\" option is not valid." % (SearchPattern))
  532     OptionsInfo["SearchPatternMol"] = SearchPatternMol
  533 
  534     # Setup target pattern information...
  535     Mode = OptionsInfo["Mode"]
  536     TargetPattern = OptionsInfo["TargetPattern"]
  537 
  538     TargetPatternInfo = OptionsInfo["TargetPatternInfo"]
  539     if TargetPatternInfo is not None:
  540         TargetPatternInfo["PatternObjects"] = []
  541         TargetPatternInfo["AttachSMILESAtomIndexOffsets"] = []
  542             
  543     if re.match("^(AttachAtoms|ReplaceAtoms)$", Mode, re.I):
  544         for Pattern in TargetPatternInfo["Patterns"]:
  545             TargetPatternInfo["PatternObjects"].append(Chem.GetPeriodicTable().GetAtomicNumber(Pattern))
  546             TargetPatternInfo["AttachSMILESAtomIndexOffsets"].append(None)
  547     elif re.match("^AttachSMILES$", Mode, re.I):
  548         for Pattern in TargetPatternInfo["Patterns"]:
  549             SMILESMol = Chem.MolFromSmiles(Pattern)
  550             AttachAtomIndexOffset = GetAttachSMILESAtomIndexOffset(SMILESMol)
  551             
  552             TargetPatternInfo["PatternObjects"].append(SMILESMol)
  553             TargetPatternInfo["AttachSMILESAtomIndexOffsets"].append(AttachAtomIndexOffset)
  554     elif re.match("^RxnSMARTS$", Mode, re.I):
  555         for Pattern in TargetPatternInfo["Patterns"]:
  556             TargetPatternInfo["PatternObjects"].append(AllChem.ReactionFromSmarts(Pattern))
  557             TargetPatternInfo["AttachSMILESAtomIndexOffsets"].append(None)
  558     elif re.match("^TandemScan$", Mode, re.I):
  559         TargetPatternTandemScanInfo = OptionsInfo["TargetPatternTandemScanInfo"]
  560         for PatternInfo in TargetPatternTandemScanInfo:
  561             PatternInfo["PatternObjects"] = []
  562             PatternInfo["AttachSMILESAtomIndexOffsets"] = []
  563             for NameIndex, Name in enumerate(PatternInfo["Names"]):
  564                 Pattern = PatternInfo["Pattern"][NameIndex]
  565                 if re.match("^(AttachAtoms|ReplaceAtoms)$", Name, re.I):
  566                     PatternInfo["PatternObjects"].append(Chem.GetPeriodicTable().GetAtomicNumber(Pattern))
  567                     PatternInfo["AttachSMILESAtomIndexOffsets"].append(None)
  568                 elif re.match("^AttachSMILES$", Name, re.I):
  569                     SMILESMol = Chem.MolFromSmiles(Pattern)
  570                     AttachAtomIndexOffset = GetAttachSMILESAtomIndexOffset(SMILESMol)
  571                     
  572                     PatternInfo["PatternObjects"].append(SMILESMol)
  573                     PatternInfo["AttachSMILESAtomIndexOffsets"].append(AttachAtomIndexOffset)
  574 
  575 def GetAttachSMILESAtomIndexOffset(Mol):
  576     """Get attach atom index offset and optionally clear atom map numbers."""
  577     
  578     AttachAtomIndexOffset = 1
  579     
  580     AtomMapIndices  = RDKitUtil.GetAtomMapIndices(Mol)
  581     if AtomMapIndices is not None and len(AtomMapIndices):
  582         # Use the atom index corresponding to the lowest atom map number
  583         # as the attachement atom. The AtomMapIndices list contains atom indices
  584         # sorted by atom map numbers.
  585         
  586         AttachAtomIndexOffset = AtomMapIndices[0] + 1
  587         
  588         if OptionsInfo["ClearAtomMapNumbers"]:
  589             RDKitUtil.ClearAtomMapNumbers(Mol)
  590 
  591     return AttachAtomIndexOffset
  592     
  593 def ProcessOptions():
  594     """Process and validate command line arguments and options."""
  595     
  596     MiscUtil.PrintInfo("Processing options...")
  597     
  598     # Validate options...
  599     ValidateOptions()
  600 
  601     OptionsInfo["ComboSearchAtoms"] = int(Options["--comboSearchAtoms"])
  602     OptionsInfo["ClearAtomMapNumbers"] = True if re.match("^yes$", Options["--clearAtomMapNumbers"], re.I) else False
  603 
  604     OptionsInfo["Infile"] = Options["--infile"]
  605     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])
  606     
  607     OptionsInfo["Outfile"] = Options["--outfile"]
  608     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])
  609     OptionsInfo["Overwrite"] = Options["--overwrite"]
  610 
  611     OptionsInfo["Mode"] = Options["--mode"]
  612     (AttachAtomsMode, ReplaceAtomsMode, AttachSMILESMode, RxnSMARTSMode, TandemScanMode) = [False] * 5
  613     Mode = OptionsInfo["Mode"]
  614     if re.match("^AttachAtoms$", Mode, re.I):
  615         AttachAtomsMode = True
  616     elif re.match("^ReplaceAtoms$", Mode, re.I):
  617         ReplaceAtomsMode = True
  618     elif re.match("^AttachSMILES$", Mode, re.I):
  619         AttachSMILESMode = True
  620     elif re.match("^RxnSMARTS$", Mode, re.I):
  621         RxnSMARTSMode = True
  622     elif re.match("^TandemScan$", Mode, re.I):
  623         TandemScanMode = True
  624     else:
  625         MiscUtil.PrintError("The mode value , %s, specified using \"-m, --mode\" option is not valid" % (Mode))
  626     OptionsInfo["AttachAtomsMode"] = AttachAtomsMode
  627     OptionsInfo["ReplaceAtomsMode"] = ReplaceAtomsMode
  628     OptionsInfo["AttachSMILESMode"] = AttachSMILESMode
  629     OptionsInfo["RxnSMARTSMode"] = RxnSMARTSMode
  630     OptionsInfo["TandemScanMode"] = TandemScanMode
  631 
  632     OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False
  633     OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])
  634     
  635     OptionsInfo["NameSuffix"] = Options["--nameSuffix"]
  636     OptionsInfo["PermuteTandemScan"] = True if re.match("^yes$", Options["--permuteTandemScan"], re.I) else False
  637     OptionsInfo["QuietMode"] = True if re.match("^yes$", Options["--quiet"], re.I) else False
  638     
  639     OptionsInfo["UniqueAnalogues"] = True if re.match("^yes$", Options["--uniqueAnalogues"], re.I) else False
  640     
  641     # Process and validate search pattern...
  642     SearchPattern = re.sub(" ", "", Options["--searchPattern"])
  643     if MiscUtil.IsEmpty(SearchPattern):
  644         MiscUtil.PrintError("No value specified using \"-s, --searchPattern\" option.")
  645     if re.match("^auto$", SearchPattern, re.I):
  646         if re.match("^(AttachAtoms|ReplaceAtoms|AttachSMILES|TandemScan)$", Mode, re.I):
  647             SearchPattern = "[cH]"
  648         else:
  649             SearchPattern = None
  650     else:
  651         if re.match("^RxnSMARTS$", Mode, re.I):
  652             MiscUtil.PrintError("The specification of SMARTS search pattern, %s,  using \"-s, --searchPattern\" option is not allowed during, %s, value of \"-m, --mode\"." % (SearchPattern, Mode))
  653 
  654     if SearchPattern is not None:
  655         SearchPatternMol = Chem.MolFromSmarts(SearchPattern)
  656         if SearchPatternMol is None:
  657             MiscUtil.PrintError("Failed to create search pattern molecule. The torsion SMILES/SMARTS pattern, \"%s\", specified using \"-s, --searchPattern\" option is not valid." % (SearchPattern))
  658     OptionsInfo["SearchPattern"] = SearchPattern
  659         
  660     # Process and validate target pattern...
  661     TargetPattern = Options["--targetPattern"]
  662     if MiscUtil.IsEmpty(TargetPattern):
  663         MiscUtil.PrintError("No value specified using \"-t, --targetPattern\" option.")
  664     TargetPattern = TargetPattern.strip()
  665         
  666     if re.match("^auto$", TargetPattern, re.I):
  667         # Setup default target pattern...
  668         if re.match("^AttachAtoms$", Mode, re.I):
  669             TargetPattern = "F"
  670         elif re.match("^ReplaceAtoms$", Mode, re.I):
  671             TargetPattern = "N"
  672         elif re.match("^AttachSMILES$", Mode, re.I):
  673             TargetPattern = "C(F)(F)(F)"
  674         elif re.match("^RxnSMARTS$", Mode, re.I):
  675             TargetPattern = "[cH:1]>>[N:1]"
  676         elif re.match("^TandemScan$", Mode, re.I):
  677             TargetPattern = "ReplaceAtoms,N,AttachAtoms,F"
  678         else:
  679             MiscUtil.PrintError("Failed to setup default target pattern for mode, %s, specified using \"-m, --mode\"" % (Mode))
  680     
  681     TargetPatternInfo = None
  682     TargetPatternTandemScanInfo = None
  683     
  684     if re.match("^(AttachAtoms|ReplaceAtoms)$", Mode, re.I):
  685         TargetPatternInfo = InitializeTargetPatternInfo(TargetPattern)
  686         for ElementSymbol in TargetPatternInfo["Patterns"]:
  687             if not RDKitUtil.IsValidElementSymbol(ElementSymbol):
  688                 MiscUtil.PrintError("The target pattern, \"%s\", specified using \"-t, --targetPattern\" must be a valid atomic symbol during, %s, value of \"-m, --mode\" option." % (ElementSymbol, Mode))
  689     elif re.match("^AttachSMILES$", Mode, re.I):
  690         TargetPatternInfo = InitializeTargetPatternInfo(TargetPattern)
  691         for SMILES in TargetPatternInfo["Patterns"]:
  692             ValidateSMILESTargetPattern(SMILES, Mode)
  693     elif re.match("^RxnSMARTS$", Mode, re.I):
  694         TargetPatternInfo = InitializeTargetPatternInfo(TargetPattern)
  695         for RxnPattern in TargetPatternInfo["Patterns"]:
  696             try:
  697                 Rxn = AllChem.ReactionFromSmarts(RxnPattern)
  698             except Exception as ErrMsg:
  699                 MiscUtil.PrintError("The target pattern, \"%s\", specified using \"-t, --targetPattern\" must be a valid RxnSMARTS  during, %s, value of \"-m, --mode\" option.\nErrMsg: %s" % (RxnPattern, Mode, ErrMsg))
  700     elif re.match("^TandemScan$", Mode, re.I):
  701         TargetPatternTandemScanInfo = ProcessTandemScanTargetPattern(TargetPattern)
  702 
  703     # Track target pattern info...
  704     OptionsInfo["TargetPattern"] = TargetPattern
  705     OptionsInfo["TargetPatternInfo"] = TargetPatternInfo
  706     OptionsInfo["TargetPatternTandemScanInfo"] = TargetPatternTandemScanInfo
  707     
  708 def InitializeTargetPatternInfo(TargetPattern):
  709     """Initialize target pattern information."""
  710     
  711     TargetPatternInfo = {}
  712     TargetPatternInfo["Patterns"]  = TargetPattern.split()
  713 
  714     return TargetPatternInfo
  715 
  716 def ProcessTandemScanTargetPattern(TargetPattern):
  717     """Process target pattern for tandem scan."""
  718 
  719     # Validate target pattern...
  720     ValidatedTargetPatternInfo = ValidateTandemScanTargetPattern(TargetPattern)
  721 
  722     # Expand target patterns to cover various combinations...
  723     #
  724     # For example: "ReplaceAtoms,N,Attach Atoms, F Cl" expands to
  725     # "ReplaceAtoms,N,AttachAtoms,F" and "ReplaceAtoms,N,AttachAtoms,Cl"
  726     #
  727     ExpandedPatterns = None
  728     for NameIndex, Name in enumerate(ValidatedTargetPatternInfo["Names"]):
  729         Patterns = ValidatedTargetPatternInfo["Patterns"][NameIndex]
  730         
  731         if ExpandedPatterns is None:
  732             ExpandedPatterns = []
  733             for Pattern in Patterns:
  734                 ExpandedPatterns.append([Name, Pattern])
  735             continue
  736 
  737         NewExpandedPatterns = []
  738         for ExpandedPattern in  ExpandedPatterns:
  739             for Pattern in Patterns:
  740                 NewExpandedPattern = []
  741                 NewExpandedPattern.extend(ExpandedPattern)
  742                 
  743                 NewExpandedPattern.append(Name)
  744                 NewExpandedPattern.append(Pattern)
  745                 
  746                 NewExpandedPatterns.append(NewExpandedPattern)
  747         
  748         ExpandedPatterns = NewExpandedPatterns
  749 
  750     # Track target pattern info...
  751     TargetPatternTandemScanInfo = []
  752     for ExpandedPattern in ExpandedPatterns:
  753         PatternInfo = {}
  754         PatternInfo["Names"] = []
  755         PatternInfo["Pattern"] = []
  756 
  757         for Index in range(0, len(ExpandedPattern), 2):
  758             Name = ExpandedPattern[Index].strip()
  759             Pattern = ExpandedPattern[Index + 1].strip()
  760             PatternInfo["Names"].append(Name)
  761             PatternInfo["Pattern"].append(Pattern)
  762 
  763         TargetPatternTandemScanInfo.append(PatternInfo)
  764 
  765     return TargetPatternTandemScanInfo
  766     
  767 def ValidateTandemScanTargetPattern(TargetPattern):
  768     """Validate target pattern for tandem scan."""
  769 
  770     ValidatedTargetPatternInfo = {}
  771     ValidatedTargetPatternInfo["Names"] = []
  772     ValidatedTargetPatternInfo["Patterns"] = []
  773 
  774     Mode = OptionsInfo["Mode"]
  775     
  776     TargetPatternWords = TargetPattern.split(",")
  777     if len(TargetPatternWords) % 2:
  778         MiscUtil.PrintError("The number of comma delimited paramater names and values, %d, specified using \"-t --targetPattern\" option must be an even number during, %s, value of \"-m, --mode\" option." % (len(TargetPatternWords), Mode))
  779     
  780     ValidOperationNames = ["AttachAtoms", "ReplaceAtoms", "AttachSMILES"]
  781     CanonicalOperationNamesMap = {}
  782     for OperationName in ValidOperationNames:
  783         CanonicalOperationNamesMap[OperationName.lower()] = OperationName
  784     
  785     # Validate PAS operation name and value pairs...
  786     for Index in range(0, len(TargetPatternWords), 2):
  787         OperationName = TargetPatternWords[Index].strip()
  788         OperationPattern = TargetPatternWords[Index + 1].strip()
  789         
  790         CanonicalOperationName = OperationName.lower()
  791         if  not CanonicalOperationName in CanonicalOperationNamesMap:
  792             MiscUtil.PrintError("The operation name, %s, specified using \"-t, --targetPattern\" is not a valid during, %s, value of \"-m, --mode\" option. Supported operation  names: %s" % (OperationName, Mode, " ".join(ValidOperationNames)))
  793             
  794         if not OperationPattern:
  795             MiscUtil.PrintError("Empty target pattern specified using \"-t, --targetPattern\" must be a valid atomic symbol for operation, %s, during, %s, value of \"-m, --mode\" option." % (OperationName, Mode))
  796 
  797         Name = CanonicalOperationNamesMap[CanonicalOperationName]
  798         OperationPatterns = OperationPattern.split()
  799         if re.match("^(AttachAtoms|ReplaceAtoms)$", OperationName, re.I):
  800             for Pattern in OperationPatterns:
  801                 if not RDKitUtil.IsValidElementSymbol(Pattern):
  802                     MiscUtil.PrintError("The target pattern, \"%s\", specified using \"-t, --targetPattern\" must be a valid atomic symbol for operation, %s, during, %s, value of \"-m, --mode\" option." % (Pattern, OperationName, Mode))
  803         elif re.match("^AttachSMILES$", OperationName, re.I):
  804             for Pattern in OperationPatterns:
  805                 ValidateSMILESTargetPattern(Pattern, Mode, OperationName)
  806 
  807         ValidatedTargetPatternInfo["Names"].append(Name)
  808         ValidatedTargetPatternInfo["Patterns"].append(OperationPatterns)
  809 
  810     return ValidatedTargetPatternInfo
  811 
  812 def ValidateSMILESTargetPattern(TargetPattern, Mode, OperationName = None):
  813     """Validate SMILES pattern."""
  814 
  815     Mol = Chem.MolFromSmiles(TargetPattern)
  816     if Mol is None:
  817         if OperationName is None:
  818             MiscUtil.PrintError("The target pattern, \"%s\", specified using \"-t, --targetPattern\" must be a valid SMILES during, %s, value of \"-m, --mode\" option." % (TargetPattern, Mode))
  819         else:
  820             MiscUtil.PrintError("The target pattern, \"%s\", specified using \"-t, --targetPattern\" must be a valid SMILES for operation, %s, during, %s, value of \"-m, --mode\" option." % (TargetPattern, OperationName, Mode))
  821 
  822 def RetrieveOptions():
  823     """Retrieve command line arguments and options."""
  824     
  825     # Get options...
  826     global Options
  827     Options = docopt(_docoptUsage_)
  828 
  829     # Set current working directory to the specified directory...
  830     WorkingDir = Options["--workingdir"]
  831     if WorkingDir:
  832         os.chdir(WorkingDir)
  833     
  834     # Handle examples option...
  835     if "--examples" in Options and Options["--examples"]:
  836         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
  837         sys.exit(0)
  838 
  839 def ValidateOptions():
  840     """Validate option values."""
  841     
  842     MiscUtil.ValidateOptionIntegerValue("-c, --comboSearchAtoms", Options["--comboSearchAtoms"], {">": 0})
  843     MiscUtil.ValidateOptionTextValue("--clearAtomMapNumbers", Options["--clearAtomMapNumbers"], "yes no")
  844     
  845     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
  846     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd mol smi txt csv tsv")
  847 
  848     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi txt csv tsv")
  849     MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
  850     MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
  851     
  852     MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "ReplaceAtoms AttachAtoms AttachSMILES RxnSMARTS TandemScan")
  853     
  854     MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
  855     MiscUtil.ValidateOptionTextValue("-p, --permuteTandemScan", Options["--permuteTandemScan"], "yes no")
  856     MiscUtil.ValidateOptionTextValue("-q, --quiet", Options["--quiet"], "yes no")
  857     
  858     MiscUtil.ValidateOptionTextValue(" -u, --uniqueAnalogues", Options["--uniqueAnalogues"], "yes no")
  859 
  860 # Setup a usage string for docopt...
  861 _docoptUsage_ = """
  862 RDKitPerformPositionalAnalogueScan.py - Positional analogue scanning.
  863 
  864 Usage:
  865     RDKitPerformPositionalAnalogueScan.py [--comboSearchAtoms <number>] [--clearAtomMapNumbers <yes or no>]
  866                                           [--infileParams <Name,Value,...>] [--mode <ReplaceAtoms, AttachAtoms, AttachSMILES, RxnSMARTS, TandemScan>]
  867                                           [--mp <yes or no>] [--mpParams <Name,Value,...>] [ --nameSuffix <text>] [ --outfileParams <Name,Value,...> ]
  868                                           [--overwrite] [--permuteTandemScan <yes or no>] [--quiet <yes or no>] [--searchPattern <SMARTSPattern>]
  869                                           [--targetPattern <ElementSymbol, SMILES, RxnSMARTS,...>] [--uniqueAnalogues <yes or no> ]
  870                                           [-w <dir>] -i <infile>  -o <outfile> 
  871     RDKitPerformPositionalAnalogueScan.py -h | --help | -e | --examples
  872 
  873 Description:
  874     Perform Positional Analogue Scanning (PAS) to generate analogues of molecules
  875     by applying chemical transformations to molecules [ Ref 147-148 ]. The chemical
  876     transformations are defined using SMARTS, element symbols, SMILES, and 
  877     RxnSMARTS. Four different types of chemical transformations are available for
  878     for generating analogues of molecules: replace atoms, attach atoms, attach SMILES,
  879     and RxnSMARTS. Tandem positional analogue scanning may be performed by the
  880     concurrent application of multiple chemical transformations.
  881     
  882     A SMARTS search pattern identifies atoms in molecules for attachment or
  883     replacement points during positional analogue scanning. It may retrieve multiple
  884     substructure matches in a molecule. The first matched atom in each substructure
  885     match comprises a set of attachment or replacement points.
  886     
  887     A target pattern encompasses information regarding element symbol, SMILES, and
  888     reaction SMARTS for replacing and attaching atoms, attaching SMILES, and applying
  889     reaction SMARTS. In addition, multiple concurrent chemical transformations may
  890     be specified during tandem positional analogue scanning. 
  891     
  892     The supported input file formats are: Mol (.mol), SD (.sdf, .sd), SMILES (.smi,
  893     .csv, .tsv, .txt)
  894 
  895     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi, .csv,
  896     .tsv, .txt)
  897 
  898 Options:
  899     -c, --comboSearchAtoms <number>  [default: 1]
  900         Number of concurrent search pattern match atoms to use as attachment or
  901         replacement points during positional analogue scanning in 'AttachAtoms', 
  902         'AttachSMILES', and 'ReplaceAtoms' modes. This value is ignored during 
  903         'RxnSMARTS' and 'TandemScan' values of  '-m, --mode' option.
  904     --clearAtomMapNumbers <yes or no>  [default: yes]
  905         Clear any atom map numbers in target SMILES specification for 'AttachSMILES'
  906         before generating analogues.
  907     -e, --examples
  908         Print examples.
  909     -h, --help
  910         Print this help message.
  911     -i, --infile <infile>
  912         Input file name.
  913     --infileParams <Name,Value,...>  [default: auto]
  914         A comma delimited list of parameter name and value pairs for reading
  915         molecules from files. The supported parameter names for different file
  916         formats, along with their default values, are shown below:
  917             
  918             SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes
  919             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
  920                 smilesTitleLine,auto,sanitize,yes
  921             
  922         Possible values for smilesDelimiter: space, comma or tab.
  923     -m, --mode <ReplaceAtoms, AttachAtoms,...>  [default: ReplaceAtoms]
  924         Type of operations to perform during positional analogue scanning. The
  925         supported values, along with a brief description, are shown below:
  926             
  927             Mode           Description
  928             ReplaceAtoms   Replace atoms with new atoms
  929             AttachAtoms    Attach new atoms to atoms
  930             AttachSMILES   Attach SMILES to atoms
  931             RxnSMARTS      Run reaction SMARTS
  932             TandemScan     Perform tandem scan by combining ReplaceAtoms,
  933                 AttachAtoms, and AttachSMILES                    
  934              
  935         The chemical transformations of input molecules is dependent on the
  936         values of '-s, --searchPattern' and  '-t, --targetPattern' options. For
  937         example, nitrogen-walk or nitrogen scan is performed by '[cH]' and 'N'
  938         values for '-s, --searchPattern' and  '-t, --targetPattern' options.
  939     --mp <yes or no>  [default: no]
  940         Use multiprocessing.
  941          
  942         By default, input data is retrieved in a lazy manner via mp.Pool.imap()
  943         function employing lazy RDKit data iterable. This allows processing of
  944         arbitrary large data sets without any additional requirements memory.
  945         
  946         All input data may be optionally loaded into memory by mp.Pool.map()
  947         before starting worker processes in a process pool by setting the value
  948         of 'inputDataMode' to 'InMemory' in '--mpParams' option.
  949         
  950         A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
  951         data mode may adversely impact the performance. The '--mpParams' section
  952         provides additional information to tune the value of 'chunkSize'.
  953     --mpParams <Name,Value,...>  [default: auto]
  954         A comma delimited list of parameter name and value pairs to configure
  955         multiprocessing.
  956         
  957         The supported parameter names along with their default and possible
  958         values are shown below:
  959         
  960             chunkSize, auto
  961             inputDataMode, Lazy   [ Possible values: InMemory or Lazy ]
  962             numProcesses, auto   [ Default: mp.cpu_count() ]
  963         
  964         These parameters are used by the following functions to configure and
  965         control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
  966         mp.Pool.imap().
  967         
  968         The chunkSize determines chunks of input data passed to each worker
  969         process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
  970         The default value of chunkSize is dependent on the value of 'inputDataMode'.
  971         
  972         The mp.Pool.map() function, invoked during 'InMemory' input data mode,
  973         automatically converts RDKit data iterable into a list, loads all data into
  974         memory, and calculates the default chunkSize using the following method
  975         as shown in its code:
  976         
  977             chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
  978             if extra: chunkSize += 1
  979         
  980         For example, the default chunkSize will be 7 for a pool of 4 worker processes
  981         and 100 data items.
  982         
  983         The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
  984         'lazy' RDKit data iterable to retrieve data as needed, without loading all the
  985         data into memory. Consequently, the size of input data is not known a priori.
  986         It's not possible to estimate an optimal value for the chunkSize. The default 
  987         chunkSize is set to 1.
  988         
  989         The default value for the chunkSize during 'Lazy' data mode may adversely
  990         impact the performance due to the overhead associated with exchanging
  991         small chunks of data. It is generally a good idea to explicitly set chunkSize to
  992         a larger value during 'Lazy' input data mode, based on the size of your input
  993         data and number of processes in the process pool.
  994         
  995         The mp.Pool.map() function waits for all worker processes to process all
  996         the data and return the results. The mp.Pool.imap() function, however,
  997         returns the the results obtained from worker processes as soon as the
  998         results become available for specified chunks of data.
  999         
 1000         The order of data in the results returned by both mp.Pool.map() and 
 1001         mp.Pool.imap() functions always corresponds to the input data.
 1002     -n, --nameSuffix <text>  [default: Analogue]
 1003         Name suffix for generating molecule names of analogues. Format of analogue
 1004         names: <MolName>_<NameSuffix>_<MolNum>
 1005     -o, --outfile <outfile>
 1006         Output file name.
 1007     --outfileParams <Name,Value,...>  [default: auto]
 1008         A comma delimited list of parameter name and value pairs for writing
 1009         molecules to files. The supported parameter names for different file
 1010         formats, along with their default values, are shown below:
 1011             
 1012             SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
 1013             SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 1014                 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no
 1015             
 1016     --overwrite
 1017         Overwrite existing files.
 1018     -p, --permuteTandemScan <yes or no>  [default: yes]
 1019         Permute atom positions matched by SMARTS search pattern in a molecule
 1020         to generate all possible analogues during tandem positional analogue
 1021         scanning.
 1022         
 1023         This option is only valid for 'TandemScan' value of '-m, --mode' option.
 1024     -q, --quiet <yes or no>  [default: no]
 1025         Use quiet mode. The warning and information messages will not be printed.
 1026     -s, --searchPattern <SMARTSPattern>  [default: auto]
 1027         SMARTS search pattern identifying atoms in molecules for attachment or
 1028         replacement points during positional analogue scanning. The SMARTS search
 1029         pattern may retrieve multiple substructure matches in a molecule. The first
 1030         matched atom in each substructure match comprises a set of attachment or
 1031         replacement points.
 1032         
 1033         The default values, dependent on the value of '-m, --mode' option, are
 1034         shown below:
 1035             
 1036             Mode            Default   Description
 1037             ReplaceAtoms    [cH]      Aromatic carbon  
 1038             AttachAtoms     [cH]      Aromatic carbon
 1039             AttachSMILES    [cH]      Aromatic carbon
 1040             RxnSMARTS       None      Not applicable
 1041             TandemScan      [cH]      Aromatic carbon
 1042             
 1043         This option is ignored during 'RxnSMARTS' value of '-m, --mode' option.
 1044     -t, --targetPattern <ElementSymbol, SMILES, RxnSMARTS...>  [default: auto]
 1045         Target pattern for performing chemical transformations during positional
 1046         analogue scanning. These values are used in conjunction with the value of
 1047         '-s, --searchPattern' to generate appropriate analogues.
 1048         
 1049         The default values, dependent on the values of '-m, --mode' and 
 1050         '-s, --searchPattern' options, are shown below:
 1051             
 1052             Mode            Default        Description
 1053             ReplaceAtoms    N              Element symbol for nitrogen
 1054             AttachAtoms     F              Element symbol for fluorine
 1055             AttachSMILES    C(F)(F)(F)     SMILES for CF3
 1056             RxnSMARTS       [cH:1]>>[N:1] Replace aromatic carbon by nitrogen
 1057             TandemScan      ReplaceAtoms,N,AttachAtoms,F  Replace and attach
 1058             
 1059         The target specifications may contain multiple space delimted values. For
 1060         example:
 1061             
 1062             Mode            Example
 1063             ReplaceAtoms    "N P"
 1064             AttachAtoms     "F Cl Br"
 1065             AttachSMILES    "C(F)(F)(F) CO"
 1066             RxnSMARTS       "[cH:1]>>[N:1] [cH:1]>>[P:1]"
 1067             TandemScan      "ReplaceAtoms,N P,AttachAtoms,F Cl"
 1068             
 1069         The target specification for 'AttachSMILES' may contain atom map numbers.
 1070         The attachment point corresponds to the matched atom with the lowest
 1071         atom map number. Otherwise, the first atom in SMILES target specification
 1072         represents the attachment point. For example:
 1073             
 1074             Mode            Example
 1075             AttachSMILES    "C(C)(C)(C) CO"
 1076             AttachSMILES    "C([C:1])(C)(C) C(F)(F)(F)"
 1077             
 1078         Multiple concurrent chemical transformations are allowed during 'TandemScan'. The
 1079         target pattern specification for 'TandemScan' is a comma delimited list of operation
 1080         type and target pattern. Format: OperationType,TargetPattern,...
 1081         
 1082         The supported operation types and target pattern for 'TandemScan' are shown
 1083         below:
 1084             
 1085             ReplaceAtoms,<ElementSymbol>
 1086             AttachAtoms,<ElementSymbol>
 1087             AttachSMILES,<SMILES> 
 1088             
 1089             ReplaceAtoms,"<ElementSymbol> <ElementSymbol>..."
 1090             AttachAtoms,"<ElementSymbol> <ElementSymbol>..."
 1091             AttachSMILES,"<SMILES> <SMIELS>..."
 1092             
 1093         For example:
 1094             
 1095             ReplaceAtoms,N,AttachAtoms,F
 1096             ReplaceAtoms,N,AttachAtoms,F,AttachSMILES,C(F)(F)(F) 
 1097             
 1098             "ReplaceAtoms,N,AttachAtoms,F Cl"
 1099             "ReplaceAtoms,N P,AttachAtoms,F Cl"
 1100             "ReplaceAtoms,N P,AttachAtoms,F Cl,AttachSMILES,C(F)(F)(F) CO" 
 1101                     
 1102         The number of chemical transformations  in 'TandemScan' must be less than or
 1103         equal to the total number atoms matched by SMARTS search pattern in a molecule.
 1104         Otherwise, it is not possible to perform a 'TandemScan'. The matched atom positions
 1105         may be optionally permuted to generate all possible analogues during  positional
 1106         analogue scanning using '-p, --permuteTandemScan' option.
 1107     -u, --uniqueAnalogues <yes or no>  [default: yes]
 1108         Keep only unique analogues of a molecule corresponding to unique SMILES
 1109         strings. The duplicate SMILES string may be generated during PAS due to
 1110         symmetric replacement or attachment points in molecules.
 1111     -w, --workingdir <dir>
 1112         Location of working directory which defaults to the current directory.
 1113 
 1114 Examples:
 1115     To perform a nitrogen-walk or nitrogen scan by replacing aromatic carbons by
 1116     nitrogens in molecules in a SMILES file and write out a SMILES file, type:
 1117 
 1118         % RDKitPerformPositionalAnalogueScan.py  -i Sample.smi -o SampleOut.smi
 1119 
 1120     To run the first example by explicitly specifying search and target patterns
 1121     for replacing aromatic carbons by nitogens in molecules in a SD file and write
 1122     out a SD file, type:
 1123 
 1124         % RDKitPerformPositionalAnalogueScan.py  -m ReplaceAtoms -s "[cH]"
 1125           -t "N" -i Sample.sdf -o SampleOut.sdf
 1126 
 1127     To run the previous example for replacing aromatic carbons by N and P, type:
 1128 
 1129         % RDKitPerformPositionalAnalogueScan.py  -m ReplaceAtoms -s "[cH]"
 1130           -t "N P" -i Sample.sdf -o SampleOut.sdf
 1131 
 1132     To run the first example in multiprocessing mode on all available CPUs
 1133     without loading all data into memory and write out a SD file, type:
 1134 
 1135         % RDKitPerformPositionalAnalogueScan.py  -i Sample.smi -o SampleOut.sdf
 1136           --mp yes
 1137      
 1138     To run the previous example in multiprocessing mode on all available CPUs
 1139     by loading all data into memory and write out a SD file, type:
 1140 
 1141         % RDKitPerformPositionalAnalogueScan.py  -i Sample.smi -o SampleOut.sdf
 1142           --mp yes --mpParams "inputDataMode,InMemory"
 1143      
 1144     To run the previous example in multiprocessing mode on specific number of
 1145     CPUs and chunk size without loading all data into memory and write out a SD file,
 1146     type:
 1147 
 1148         % RDKitPerformPositionalAnalogueScan.py  -i Sample.smi -o SampleOut.sdf
 1149           --mpParams "inputDataMode,Lazy,numProcesses,4,chunkSize,8"
 1150      
 1151     To perform positional analogue scanning by simultaneously attaching fluorines
 1152     to two aromatic carbons in molecules in a SMILES file and write out a SD file,
 1153     type:
 1154 
 1155         % RDKitPerformPositionalAnalogueScan.py  -m AttachAtoms -s "[cH]"
 1156           -t "F" -c 2 -i Sample.smi -o SampleOut.sdf
 1157 
 1158     To perform positional analogue scanning by attaching SMILES to aromatic
 1159     carbons in molecules in a SMILES file and write out a SD file, type:
 1160 
 1161         % RDKitPerformPositionalAnalogueScan.py  -m AttachSMILES -s "[cH]"
 1162           -t "F" -i Sample.smi -o SampleOut.sdf
 1163 
 1164     To run the previous example for attaching F and CF3 to aromatic carbons,
 1165     type:
 1166 
 1167         % RDKitPerformPositionalAnalogueScan.py  -m AttachSMILES -s "[cH]"
 1168           -t "F C(F)(F)(F)" -i Sample.smi -o SampleOut.sdf
 1169 
 1170     To run the previous example for attaching "C(C)(C)(C)" at a specific attachment
 1171     point corresponding to the lowest atom map number,  type:
 1172 
 1173         % RDKitPerformPositionalAnalogueScan.py  -m AttachSMILES -s "[cH]"
 1174           -t "C([C:1])([C:2])(C)" -i Sample.smi -o SampleOut.sdf
 1175 
 1176     To perform a nitrogen-walk or nitrogen scan by using reaction SMARTS to replace
 1177     aromatic carbons by nitrogens in molecules in a SMILES file and write out a  SMILES
 1178     file,  type:
 1179 
 1180         % RDKitPerformPositionalAnalogueScan.py  -m RxnSMARTS
 1181           -t "[cH:1]>>[N:1]" -i Sample.smi -o SampleOut.smi
 1182 
 1183     To perform a tandem positional analogue scan by concurrently applying multiple
 1184     chemical transformations to aromatic carbons, permute all matched search
 1185     atom positions during analogue generation, and write out a SD file, type:
 1186 
 1187         % RDKitPerformPositionalAnalogueScan.py  -m TandemScan -s "[cH]"
 1188           -t "ReplaceAtoms,N,AttachAtoms,F,AttachSMILES,OC"
 1189           -p yes  -i Sample.smi -o SampleOut.smi
 1190 
 1191     To run the previous example for attaching multiple atoms and SMILES to
 1192     aromatic carbons, type:
 1193 
 1194         % RDKitPerformPositionalAnalogueScan.py  -m TandemScan -s "[cH]"
 1195           -t "ReplaceAtoms,N,AttachAtoms,F Cl,AttachSMILES,C(F)(F)(F) OC"
 1196           -p yes  -i Sample.smi -o SampleOut.smi
 1197 
 1198     To perform a nitrogen-walk or nitrogen scan by replacing aromatic carbons by
 1199     nitrogens in molecules in a SMILES CSV fileS, MILES strings in column 1, name
 1200     in column 2, and write out a SD file, type:
 1201 
 1202         % RDKitPerformPositionalAnalogueScan.py  -m ReplaceAtoms -s "[cH]"
 1203           -t "N" --infileParams "smilesDelimiter,comma, smilesTitleLine,yes,
 1204           smilesColumn,1,smilesNameColumn,2"
 1205           -i SampleSMILES.csv -o SampleOut.sdf
 1206 
 1207 Author:
 1208     Manish Sud(msud@san.rr.com)
 1209 
 1210 See also:
 1211     RDKitConvertFileFormat.py, RDKitEnumerateCompoundLibrary.py,
 1212     RDKitPerformTorsionScan.py
 1213 
 1214 Copyright:
 1215     Copyright (C) 2024 Manish Sud. All rights reserved.
 1216 
 1217     The functionality available in this script is implemented using RDKit, an
 1218     open source toolkit for cheminformatics developed by Greg Landrum.
 1219 
 1220     This file is part of MayaChemTools.
 1221 
 1222     MayaChemTools is free software; you can redistribute it and/or modify it under
 1223     the terms of the GNU Lesser General Public License as published by the Free
 1224     Software Foundation; either version 3 of the License, or (at your option) any
 1225     later version.
 1226 
 1227 """
 1228 
 1229 if __name__ == "__main__":
 1230     main()