MayaChemTools

    1 #!/bin/env python
    2 #
    3 # File: RDKitPerformPositionalAnalogueScan.py
    4 # Author: Manish Sud <msud@san.rr.com>
    5 #
    6 # Copyright (C) 2026 Manish Sud. All rights reserved.
    7 #
    8 # The functionality available in this script is implemented using RDKit, an
    9 # open source toolkit for cheminformatics developed by Greg Landrum.
   10 #
   11 # This file is part of MayaChemTools.
   12 #
   13 # MayaChemTools is free software; you can redistribute it and/or modify it under
   14 # the terms of the GNU Lesser General Public License as published by the Free
   15 # Software Foundation; either version 3 of the License, or (at your option) any
   16 # later version.
   17 #
   18 # MayaChemTools is distributed in the hope that it will be useful, but without
   19 # any warranty; without even the implied warranty of merchantability of fitness
   20 # for a particular purpose.  See the GNU Lesser General Public License for more
   21 # details.
   22 #
   23 # You should have received a copy of the GNU Lesser General Public License
   24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
   25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
   26 # Boston, MA, 02111-1307, USA.
   27 #
   28 
   29 from __future__ import print_function
   30 
   31 import os
   32 import sys
   33 import time
   34 import re
   35 import multiprocessing as mp
   36 from itertools import combinations
   37 from itertools import permutations
   38 
   39 # RDKit imports...
   40 try:
   41     from rdkit import rdBase
   42     from rdkit import Chem
   43     from rdkit.Chem import AllChem
   44 except ImportError as ErrMsg:
   45     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
   46     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
   47     sys.exit(1)
   48 
   49 # MayaChemTools imports...
   50 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
   51 try:
   52     from docopt import docopt
   53     import MiscUtil
   54     import RDKitUtil
   55 except ImportError as ErrMsg:
   56     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
   57     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
   58     sys.exit(1)
   59 
   60 ScriptName = os.path.basename(sys.argv[0])
   61 Options = {}
   62 OptionsInfo = {}
   63 
   64 
   65 def main():
   66     """Start execution of the script."""
   67 
   68     MiscUtil.PrintInfo(
   69         "\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n"
   70         % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())
   71     )
   72 
   73     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
   74 
   75     # Retrieve command line arguments and options...
   76     RetrieveOptions()
   77 
   78     # Process and validate command line arguments and options...
   79     ProcessOptions()
   80 
   81     # Perform actions required by the script...
   82     PerformPositionalAnalogueScan()
   83 
   84     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
   85     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
   86 
   87 
   88 def PerformPositionalAnalogueScan():
   89     """Perform positional analogue scan."""
   90 
   91     # Setup a molecule reader for input file...
   92     MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
   93     OptionsInfo["InfileParams"]["AllowEmptyMols"] = True
   94     Mols = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])
   95 
   96     # Set up a molecule writer...
   97     Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
   98     if Writer is None:
   99         MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"])
  100     MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])
  101 
  102     MolCount, ValidMolCount, SearchPatternMissingCount, AnaloguesGenerationFailedCount, AnaloguesCount = (
  103         ProcessMolecules(Mols, Writer)
  104     )
  105 
  106     if Writer is not None:
  107         Writer.close()
  108 
  109     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
  110     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
  111     if not OptionsInfo["RxnSMARTSMode"]:
  112         MiscUtil.PrintInfo("Number of molecules with missing search pattern: %d" % SearchPatternMissingCount)
  113     MiscUtil.PrintInfo("Number of molecules failed during generation of analogues: %d" % AnaloguesGenerationFailedCount)
  114     MiscUtil.PrintInfo(
  115         "Number of ignored molecules: %d"
  116         % (MolCount - ValidMolCount + SearchPatternMissingCount + AnaloguesGenerationFailedCount)
  117     )
  118 
  119     MiscUtil.PrintInfo("\nTotal number of analogues for valid molecules: %d" % AnaloguesCount)
  120     MiscUtil.PrintInfo("Average number of analogues per valid molecule: %d" % (int(AnaloguesCount / (ValidMolCount))))
  121 
  122 
  123 def ProcessMolecules(Mols, Writer):
  124     """Process molecules to generate analogues."""
  125 
  126     if OptionsInfo["MPMode"]:
  127         return ProcessMoleculesUsingMultipleProcesses(Mols, Writer)
  128     else:
  129         return ProcessMoleculesUsingSingleProcess(Mols, Writer)
  130 
  131 
  132 def ProcessMoleculesUsingSingleProcess(Mols, Writer):
  133     """Process molecules to generate analogues using a single process."""
  134 
  135     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
  136     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
  137 
  138     (MolCount, ValidMolCount, SearchPatternMissingCount, AnaloguesGenerationFailedCount, AnaloguesCount) = [0] * 5
  139 
  140     # Initialize search and target patterns info...
  141     SetupSearchAndTargetPatternsInfo()
  142 
  143     FirstMol = True
  144     for Mol in Mols:
  145         MolCount += 1
  146 
  147         if Mol is None:
  148             continue
  149 
  150         if RDKitUtil.IsMolEmpty(Mol):
  151             if not OptionsInfo["QuietMode"]:
  152                 MolName = RDKitUtil.GetMolName(Mol, MolCount)
  153                 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
  154             continue
  155         ValidMolCount += 1
  156 
  157         MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus = GenerateMolAnalogues(Mol, MolCount)
  158 
  159         if FirstMol:
  160             FirstMol = False
  161             if SetSMILESMolProps:
  162                 if Writer is not None:
  163                     RDKitUtil.SetWriterMolProps(Writer, Mol)
  164 
  165         if not OptionsInfo["RxnSMARTSMode"]:
  166             if SearchPatternMissingStatus:
  167                 SearchPatternMissingCount += 1
  168                 WriteMolecule(Writer, Mol, Compute2DCoords)
  169                 continue
  170 
  171         if not AnaloguesGenerationStatus:
  172             AnaloguesGenerationFailedCount += 1
  173             WriteMolecule(Writer, Mol, Compute2DCoords)
  174             continue
  175 
  176         # Write out molecues and analogues...
  177         WriteMolAnalogues(Writer, Mol, MolCount, MolAnalogues, Compute2DCoords)
  178 
  179         AnaloguesCount += len(MolAnalogues)
  180 
  181     return (MolCount, ValidMolCount, SearchPatternMissingCount, AnaloguesGenerationFailedCount, AnaloguesCount)
  182 
  183 
  184 def ProcessMoleculesUsingMultipleProcesses(Mols, Writer):
  185     """Process molecules to generate analogues using multiprocessing."""
  186 
  187     MPParams = OptionsInfo["MPParams"]
  188 
  189     # Setup data for initializing a worker process...
  190     MiscUtil.PrintInfo("Encoding options info...")
  191 
  192     InitializeWorkerProcessArgs = (
  193         MiscUtil.ObjectToBase64EncodedString(Options),
  194         MiscUtil.ObjectToBase64EncodedString(OptionsInfo),
  195     )
  196 
  197     # Setup a encoded mols data iterable for a worker process...
  198     WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)
  199 
  200     # Setup process pool along with data initialization for each process...
  201     MiscUtil.PrintInfo(
  202         "\nConfiguring multiprocessing using %s method..."
  203         % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()")
  204     )
  205     MiscUtil.PrintInfo(
  206         "NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n"
  207         % (
  208             MPParams["NumProcesses"],
  209             MPParams["InputDataMode"],
  210             ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"]),
  211         )
  212     )
  213 
  214     ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)
  215 
  216     # Start processing...
  217     if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
  218         Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
  219     elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
  220         Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
  221     else:
  222         MiscUtil.PrintError(
  223             'The value, %s, specified for "--inputDataMode" is not supported.' % (MPParams["InputDataMode"])
  224         )
  225 
  226     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
  227     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
  228 
  229     (MolCount, ValidMolCount, SearchPatternMissingCount, AnaloguesGenerationFailedCount, AnaloguesCount) = [0] * 5
  230     FirstMol = True
  231     for Result in Results:
  232         MolCount += 1
  233         MolIndex, EncodedMol, EncodedMolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus = Result
  234 
  235         if EncodedMol is None:
  236             continue
  237         ValidMolCount += 1
  238 
  239         Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
  240 
  241         if FirstMol:
  242             FirstMol = False
  243             if SetSMILESMolProps:
  244                 if Writer is not None:
  245                     RDKitUtil.SetWriterMolProps(Writer, Mol)
  246 
  247         if not OptionsInfo["RxnSMARTSMode"]:
  248             if SearchPatternMissingStatus:
  249                 SearchPatternMissingCount += 1
  250                 WriteMolecule(Writer, Mol, Compute2DCoords)
  251                 continue
  252 
  253         if not AnaloguesGenerationStatus:
  254             AnaloguesGenerationFailedCount += 1
  255             WriteMolecule(Writer, Mol, Compute2DCoords)
  256             continue
  257 
  258         MolAnalogues = [
  259             RDKitUtil.MolFromBase64EncodedMolString(EncodedMolAnalogue) for EncodedMolAnalogue in EncodedMolAnalogues
  260         ]
  261 
  262         # Write out molecues and analogues...
  263         WriteMolAnalogues(Writer, Mol, MolCount, MolAnalogues, Compute2DCoords)
  264 
  265         AnaloguesCount += len(MolAnalogues)
  266 
  267     return (MolCount, ValidMolCount, SearchPatternMissingCount, AnaloguesGenerationFailedCount, AnaloguesCount)
  268 
  269 
  270 def InitializeWorkerProcess(*EncodedArgs):
  271     """Initialize data for a worker process."""
  272 
  273     global Options, OptionsInfo
  274 
  275     MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())
  276 
  277     # Decode Options and OptionInfo...
  278     Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
  279     OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])
  280 
  281     # Initialize search and target  patterns info...
  282     SetupSearchAndTargetPatternsInfo()
  283 
  284 
  285 def WorkerProcess(EncodedMolInfo):
  286     """Process data for a worker process."""
  287 
  288     MolIndex, EncodedMol = EncodedMolInfo
  289 
  290     MolAnalogues = None
  291     SearchPatternMissingStatus = False
  292     AnaloguesGenerationStatus = True
  293 
  294     if EncodedMol is None:
  295         return [MolIndex, None, MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus]
  296 
  297     Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
  298     if RDKitUtil.IsMolEmpty(Mol):
  299         if not OptionsInfo["QuietMode"]:
  300             MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
  301             MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
  302         return [MolIndex, None, MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus]
  303 
  304     MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus = GenerateMolAnalogues(Mol, (MolIndex + 1))
  305 
  306     EncodedMolAnalogues = None
  307     if MolAnalogues is not None:
  308         EncodedMolAnalogues = [
  309             RDKitUtil.MolToBase64EncodedMolString(
  310                 MolAnalogue,
  311                 PropertyPickleFlags=Chem.PropertyPickleOptions.MolProps | Chem.PropertyPickleOptions.PrivateProps,
  312             )
  313             for MolAnalogue in MolAnalogues
  314         ]
  315 
  316     return [MolIndex, EncodedMol, EncodedMolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus]
  317 
  318 
  319 def GenerateMolAnalogues(Mol, MolNum=None):
  320     """Generate analogues."""
  321 
  322     if OptionsInfo["AttachAtomsMode"] or OptionsInfo["ReplaceAtomsMode"] or OptionsInfo["AttachSMILESMode"]:
  323         return GenerateMolAnaloguesForMultipleScanModes(Mol, MolNum)
  324     elif OptionsInfo["RxnSMARTSMode"]:
  325         return GenerateMolAnaloguesForRxnSMARTSMode(Mol, MolNum)
  326     elif OptionsInfo["TandemScanMode"]:
  327         return GenerateMolAnaloguesForTandemScanMode(Mol, MolNum)
  328     else:
  329         MiscUtil.PrintError(
  330             'Failed to generate analogues.  The mode value, %s, specified using "-m, --mode" is not supported.'
  331             % (OptionsInfo["Mode"])
  332         )
  333 
  334 
  335 def GenerateMolAnaloguesForMultipleScanModes(Mol, MolNum=None):
  336     """Generate analogues for the following scan modes: AttachAtom, ReplaceAtom
  337     or AttachSMILES."""
  338 
  339     MolAnalogues = None
  340     SearchPatternMissingStatus = False
  341     AnaloguesGenerationStatus = True
  342 
  343     # Retrieve matched atoms...
  344     MolMatchedAtomIndices = SetupMolMatchedAtomIndices(Mol, MolNum)
  345     if MolMatchedAtomIndices is None:
  346         SearchPatternMissingStatus = True
  347         return (MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus)
  348 
  349     MolAnalogues = []
  350     MolAnaloguesSet = set()
  351 
  352     TargetPatternInfo = OptionsInfo["TargetPatternInfo"]
  353     for TargetPatternIndex, TargetPatternObject in enumerate(TargetPatternInfo["PatternObjects"]):
  354         for AtomIndicesCombination in combinations(MolMatchedAtomIndices, OptionsInfo["ComboSearchAtoms"]):
  355             MolAnalogue = Chem.RWMol(Mol)
  356 
  357             for AtomIndex in AtomIndicesCombination:
  358                 if OptionsInfo["AttachAtomsMode"]:
  359                     AtomicNumber = TargetPatternObject
  360                     MolAnalogue = AttachAtom(
  361                         MolAnalogue, AtomIndex, AtomicNumber, BondOrder=Chem.rdchem.BondType.SINGLE
  362                     )
  363                 elif OptionsInfo["ReplaceAtomsMode"]:
  364                     AtomicNumber = TargetPatternObject
  365                     MolAnalogue = ReplaceAtom(MolAnalogue, AtomIndex, AtomicNumber)
  366                 elif OptionsInfo["AttachSMILESMode"]:
  367                     AttachAtomIndex = (
  368                         MolAnalogue.GetNumAtoms()
  369                         - 1
  370                         + TargetPatternInfo["AttachSMILESAtomIndexOffsets"][TargetPatternIndex]
  371                     )
  372                     SMILESMol = TargetPatternObject
  373                     MolAnalogue = AttachMol(
  374                         MolAnalogue, AtomIndex, SMILESMol, AttachAtomIndex, BondOrder=Chem.rdchem.BondType.SINGLE
  375                     )
  376 
  377             if SanitizeMolAnalogue(MolAnalogue, MolNum):
  378                 TrackAnalogues(MolAnalogue, MolAnaloguesSet, MolAnalogues)
  379 
  380     if len(MolAnalogues) == 0:
  381         AnaloguesGenerationStatus = False
  382 
  383     return (MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus)
  384 
  385 
  386 def GenerateMolAnaloguesForRxnSMARTSMode(Mol, MolNum=None):
  387     """Generate analogues for reaction SMARTS mode."""
  388 
  389     SearchPatternMissingStatus = False
  390     AnaloguesGenerationStatus = True
  391 
  392     MolAnalogues = []
  393     MolAnaloguesSet = set()
  394 
  395     TargetPatternInfo = OptionsInfo["TargetPatternInfo"]
  396     for Rxn in TargetPatternInfo["PatternObjects"]:
  397         RxnProducts = Rxn.RunReactants((Mol,))
  398 
  399         if not len(RxnProducts):
  400             continue
  401 
  402         for RxnProduct in RxnProducts:
  403             MolAnalogue = RxnProduct[0]
  404 
  405             if SanitizeMolAnalogue(MolAnalogue, MolNum):
  406                 TrackAnalogues(MolAnalogue, MolAnaloguesSet, MolAnalogues)
  407 
  408     if len(MolAnalogues) == 0:
  409         AnaloguesGenerationStatus = False
  410 
  411     return (MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus)
  412 
  413 
  414 def GenerateMolAnaloguesForTandemScanMode(Mol, MolNum=None):
  415     """Generate analogues for tandem scan mode."""
  416 
  417     MolAnalogues = None
  418     SearchPatternMissingStatus = False
  419     AnaloguesGenerationStatus = True
  420 
  421     # Retrieve matched atoms...
  422     MolMatchedAtomIndices = SetupMolMatchedAtomIndices(Mol, MolNum)
  423     if MolMatchedAtomIndices is None:
  424         SearchPatternMissingStatus = True
  425         return (MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus)
  426 
  427     TargetPatternTandemScanInfo = OptionsInfo["TargetPatternTandemScanInfo"]
  428     NumOfTandemOperations = 0
  429     if TargetPatternTandemScanInfo is not None and len(TargetPatternTandemScanInfo):
  430         NumOfTandemOperations = len(TargetPatternTandemScanInfo[0]["Names"])
  431 
  432     if len(MolMatchedAtomIndices) < NumOfTandemOperations:
  433         if not OptionsInfo["QuietMode"]:
  434             MolName = RDKitUtil.GetMolName(Mol, MolNum)
  435             MiscUtil.PrintWarning(
  436                 "Number of matched search pattern atoms, %s, must be >= number of specified operations, %s, during TandemScan mode for molecule %s..."
  437                 % (len(MolMatchedAtomIndices), NumOfTandemOperations, MolName)
  438             )
  439         SearchPatternMissingStatus = True
  440         return (MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus)
  441 
  442     MolAnalogues = []
  443     MolAnaloguesSet = set()
  444     for AtomIndicesCombination in combinations(MolMatchedAtomIndices, NumOfTandemOperations):
  445         AtomIndicesPermutations = []
  446         if OptionsInfo["PermuteTandemScan"]:
  447             for AtomIndicesPermutation in permutations(AtomIndicesCombination, NumOfTandemOperations):
  448                 AtomIndicesPermutations.append(list(AtomIndicesPermutation))
  449         else:
  450             AtomIndicesPermutations.append(list(AtomIndicesCombination))
  451 
  452         for AtomIndicesPermutation in AtomIndicesPermutations:
  453             for PatternInfo in TargetPatternTandemScanInfo:
  454                 MolAnalogue = Chem.RWMol(Mol)
  455 
  456                 for NameIndex, Name in enumerate(PatternInfo["Names"]):
  457                     AtomIndex = AtomIndicesPermutation[NameIndex]
  458                     PatternObject = PatternInfo["PatternObjects"][NameIndex]
  459 
  460                     if re.match("^AttachAtoms$", Name, re.I):
  461                         MolAnalogue = AttachAtom(
  462                             MolAnalogue, AtomIndex, PatternObject, BondOrder=Chem.rdchem.BondType.SINGLE
  463                         )
  464                     elif re.match("^ReplaceAtoms$", Name, re.I):
  465                         MolAnalogue = ReplaceAtom(MolAnalogue, AtomIndex, PatternObject)
  466                     elif re.match("^AttachSMILES$", Name, re.I):
  467                         AttachAtomIndex = (
  468                             MolAnalogue.GetNumAtoms() - 1 + PatternInfo["AttachSMILESAtomIndexOffsets"][NameIndex]
  469                         )
  470                         MolAnalogue = AttachMol(
  471                             MolAnalogue,
  472                             AtomIndex,
  473                             PatternObject,
  474                             AttachAtomIndex,
  475                             BondOrder=Chem.rdchem.BondType.SINGLE,
  476                         )
  477                         MolAnalogue = Chem.RWMol(MolAnalogue)
  478 
  479                 if not SanitizeMolAnalogue(MolAnalogue, MolNum):
  480                     continue
  481 
  482                 TrackAnalogues(MolAnalogue, MolAnaloguesSet, MolAnalogues)
  483 
  484     if len(MolAnalogues) == 0:
  485         AnaloguesGenerationStatus = False
  486 
  487     return (MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus)
  488 
  489 
  490 def TrackAnalogues(MolAnalogue, MolAnaloguesSet, MolAnalogues):
  491     """Track analogues."""
  492 
  493     if OptionsInfo["UniqueAnalogues"]:
  494         AnalogueSMILES = Chem.MolToSmiles(Chem.RemoveHs(MolAnalogue))
  495         if AnalogueSMILES not in MolAnaloguesSet:
  496             MolAnaloguesSet.add(AnalogueSMILES)
  497             MolAnalogues.append(MolAnalogue)
  498     else:
  499         MolAnalogues.append(MolAnalogue)
  500 
  501 
  502 def ReplaceAtom(Mol, AtomIndex, AtomicNumber):
  503     """Replace an existing atom with a new atom in a molecule."""
  504 
  505     Atom = Mol.GetAtomWithIdx(AtomIndex)
  506     Atom.SetAtomicNum(AtomicNumber)
  507 
  508     return Mol
  509 
  510 
  511 def AttachAtom(Mol, AtomIndex, AtomicNumber, BondOrder=Chem.rdchem.BondType.SINGLE):
  512     """Attach atom to an existing atom in a molecule."""
  513 
  514     NewAtomIndex = Mol.AddAtom(Chem.Atom(AtomicNumber))
  515     Mol.AddBond(AtomIndex, NewAtomIndex, BondOrder)
  516 
  517     return Mol
  518 
  519 
  520 def AttachMol(Mol, AtomIndex, AttachMol, AttachAtomIndex, BondOrder=Chem.rdchem.BondType.SINGLE):
  521     """Attach molecule to an exisiting molecule."""
  522 
  523     NewMol = Chem.CombineMols(Mol, AttachMol)
  524 
  525     NewMol = Chem.EditableMol(NewMol)
  526     NewMol.AddBond(AtomIndex, AttachAtomIndex, order=BondOrder)
  527     NewMol = NewMol.GetMol()
  528 
  529     return NewMol
  530 
  531 
  532 def SanitizeMolAnalogue(Mol, MolNum):
  533     """Sanitize analogue and catch any errors."""
  534 
  535     Status = True
  536     try:
  537         Chem.SanitizeMol(Mol)
  538     except Exception as ErrMsg:
  539         if not OptionsInfo["QuietMode"]:
  540             MolName = RDKitUtil.GetMolName(Mol, MolNum)
  541             MiscUtil.PrintInfo(
  542                 "\n%s\nFailed to sanitize analogue of molecule %s...\nIgnoring analogue..." % (ErrMsg, MolName)
  543             )
  544         Status = False
  545 
  546     return Status
  547 
  548 
  549 def SetupMolMatchedAtomIndices(Mol, MolNum):
  550     """Setup a list of matched atom indices corresponding to first atom in search
  551     SMARTS pattern for a molecule."""
  552 
  553     # Retrieve first atom match for PAS...
  554     MatchedAtomIndices = [AtomIndices[0] for AtomIndices in Mol.GetSubstructMatches(OptionsInfo["SearchPatternMol"])]
  555     if len(MatchedAtomIndices) == 0:
  556         if not OptionsInfo["QuietMode"]:
  557             MolName = RDKitUtil.GetMolName(Mol, MolNum)
  558             MiscUtil.PrintWarning("No search pattern match for molecule %s..." % MolName)
  559         return None
  560 
  561     return MatchedAtomIndices
  562 
  563 
  564 def WriteMolAnalogues(Writer, Mol, MolNum, MolAnalogues, Compute2DCoords):
  565     """Write molecule analogues."""
  566 
  567     # Write out molecule...
  568     WriteMolecule(Writer, Mol, Compute2DCoords)
  569 
  570     if MolAnalogues is None:
  571         return
  572 
  573     # Write out analogues...
  574     for Index, MolAnalogue in enumerate(MolAnalogues):
  575         MolName = RDKitUtil.GetMolName(Mol, MolNum)
  576         SetMolAnalogueName(MolAnalogue, MolName, (Index + 1))
  577 
  578         WriteMolecule(Writer, MolAnalogue, Compute2DCoords)
  579 
  580 
  581 def SetMolAnalogueName(MolAnalogue, MolName, AnalogueCount):
  582     """Set analogue name."""
  583 
  584     NameSuffix = OptionsInfo["NameSuffix"]
  585     AnalogueName = "%s_%s%d" % (MolName, NameSuffix, AnalogueCount)
  586     MolAnalogue.SetProp("_Name", AnalogueName)
  587 
  588 
  589 def WriteMolecule(Writer, Mol, Compute2DCoords):
  590     """Write out molecule."""
  591 
  592     if Compute2DCoords:
  593         AllChem.Compute2DCoords(Mol)
  594 
  595     Writer.write(Mol)
  596 
  597 
  598 def SetupSearchAndTargetPatternsInfo():
  599     """Setup search and target patterns info."""
  600 
  601     # Setup search pattern molecule...
  602     SearchPattern = OptionsInfo["SearchPattern"]
  603     SearchPatternMol = None
  604     if SearchPattern is not None:
  605         SearchPatternMol = Chem.MolFromSmarts(SearchPattern)
  606         if SearchPatternMol is None:
  607             MiscUtil.PrintError(
  608                 'Failed to create search pattern molecule. The torsion SMILES/SMARTS pattern, "%s", specified using "-s, --searchPattern" option is not valid.'
  609                 % (SearchPattern)
  610             )
  611     OptionsInfo["SearchPatternMol"] = SearchPatternMol
  612 
  613     # Setup target pattern information...
  614     Mode = OptionsInfo["Mode"]
  615 
  616     TargetPatternInfo = OptionsInfo["TargetPatternInfo"]
  617     if TargetPatternInfo is not None:
  618         TargetPatternInfo["PatternObjects"] = []
  619         TargetPatternInfo["AttachSMILESAtomIndexOffsets"] = []
  620 
  621     if re.match("^(AttachAtoms|ReplaceAtoms)$", Mode, re.I):
  622         for Pattern in TargetPatternInfo["Patterns"]:
  623             TargetPatternInfo["PatternObjects"].append(Chem.GetPeriodicTable().GetAtomicNumber(Pattern))
  624             TargetPatternInfo["AttachSMILESAtomIndexOffsets"].append(None)
  625     elif re.match("^AttachSMILES$", Mode, re.I):
  626         for Pattern in TargetPatternInfo["Patterns"]:
  627             SMILESMol = Chem.MolFromSmiles(Pattern)
  628             AttachAtomIndexOffset = GetAttachSMILESAtomIndexOffset(SMILESMol)
  629 
  630             TargetPatternInfo["PatternObjects"].append(SMILESMol)
  631             TargetPatternInfo["AttachSMILESAtomIndexOffsets"].append(AttachAtomIndexOffset)
  632     elif re.match("^RxnSMARTS$", Mode, re.I):
  633         for Pattern in TargetPatternInfo["Patterns"]:
  634             TargetPatternInfo["PatternObjects"].append(AllChem.ReactionFromSmarts(Pattern))
  635             TargetPatternInfo["AttachSMILESAtomIndexOffsets"].append(None)
  636     elif re.match("^TandemScan$", Mode, re.I):
  637         TargetPatternTandemScanInfo = OptionsInfo["TargetPatternTandemScanInfo"]
  638         for PatternInfo in TargetPatternTandemScanInfo:
  639             PatternInfo["PatternObjects"] = []
  640             PatternInfo["AttachSMILESAtomIndexOffsets"] = []
  641             for NameIndex, Name in enumerate(PatternInfo["Names"]):
  642                 Pattern = PatternInfo["Pattern"][NameIndex]
  643                 if re.match("^(AttachAtoms|ReplaceAtoms)$", Name, re.I):
  644                     PatternInfo["PatternObjects"].append(Chem.GetPeriodicTable().GetAtomicNumber(Pattern))
  645                     PatternInfo["AttachSMILESAtomIndexOffsets"].append(None)
  646                 elif re.match("^AttachSMILES$", Name, re.I):
  647                     SMILESMol = Chem.MolFromSmiles(Pattern)
  648                     AttachAtomIndexOffset = GetAttachSMILESAtomIndexOffset(SMILESMol)
  649 
  650                     PatternInfo["PatternObjects"].append(SMILESMol)
  651                     PatternInfo["AttachSMILESAtomIndexOffsets"].append(AttachAtomIndexOffset)
  652 
  653 
  654 def GetAttachSMILESAtomIndexOffset(Mol):
  655     """Get attach atom index offset and optionally clear atom map numbers."""
  656 
  657     AttachAtomIndexOffset = 1
  658 
  659     AtomMapIndices = RDKitUtil.GetAtomMapIndices(Mol)
  660     if AtomMapIndices is not None and len(AtomMapIndices):
  661         # Use the atom index corresponding to the lowest atom map number
  662         # as the attachement atom. The AtomMapIndices list contains atom indices
  663         # sorted by atom map numbers.
  664 
  665         AttachAtomIndexOffset = AtomMapIndices[0] + 1
  666 
  667         if OptionsInfo["ClearAtomMapNumbers"]:
  668             RDKitUtil.ClearAtomMapNumbers(Mol)
  669 
  670     return AttachAtomIndexOffset
  671 
  672 
  673 def ProcessOptions():
  674     """Process and validate command line arguments and options."""
  675 
  676     MiscUtil.PrintInfo("Processing options...")
  677 
  678     # Validate options...
  679     ValidateOptions()
  680 
  681     OptionsInfo["ComboSearchAtoms"] = int(Options["--comboSearchAtoms"])
  682     OptionsInfo["ClearAtomMapNumbers"] = True if re.match("^yes$", Options["--clearAtomMapNumbers"], re.I) else False
  683 
  684     OptionsInfo["Infile"] = Options["--infile"]
  685     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters(
  686         "--infileParams", Options["--infileParams"], Options["--infile"]
  687     )
  688 
  689     OptionsInfo["Outfile"] = Options["--outfile"]
  690     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters(
  691         "--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"]
  692     )
  693     OptionsInfo["Overwrite"] = Options["--overwrite"]
  694 
  695     OptionsInfo["Mode"] = Options["--mode"]
  696     (AttachAtomsMode, ReplaceAtomsMode, AttachSMILESMode, RxnSMARTSMode, TandemScanMode) = [False] * 5
  697     Mode = OptionsInfo["Mode"]
  698     if re.match("^AttachAtoms$", Mode, re.I):
  699         AttachAtomsMode = True
  700     elif re.match("^ReplaceAtoms$", Mode, re.I):
  701         ReplaceAtomsMode = True
  702     elif re.match("^AttachSMILES$", Mode, re.I):
  703         AttachSMILESMode = True
  704     elif re.match("^RxnSMARTS$", Mode, re.I):
  705         RxnSMARTSMode = True
  706     elif re.match("^TandemScan$", Mode, re.I):
  707         TandemScanMode = True
  708     else:
  709         MiscUtil.PrintError('The mode value , %s, specified using "-m, --mode" option is not valid' % (Mode))
  710     OptionsInfo["AttachAtomsMode"] = AttachAtomsMode
  711     OptionsInfo["ReplaceAtomsMode"] = ReplaceAtomsMode
  712     OptionsInfo["AttachSMILESMode"] = AttachSMILESMode
  713     OptionsInfo["RxnSMARTSMode"] = RxnSMARTSMode
  714     OptionsInfo["TandemScanMode"] = TandemScanMode
  715 
  716     OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False
  717     OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])
  718 
  719     OptionsInfo["NameSuffix"] = Options["--nameSuffix"]
  720     OptionsInfo["PermuteTandemScan"] = True if re.match("^yes$", Options["--permuteTandemScan"], re.I) else False
  721     OptionsInfo["QuietMode"] = True if re.match("^yes$", Options["--quiet"], re.I) else False
  722 
  723     OptionsInfo["UniqueAnalogues"] = True if re.match("^yes$", Options["--uniqueAnalogues"], re.I) else False
  724 
  725     # Process and validate search pattern...
  726     SearchPattern = re.sub(" ", "", Options["--searchPattern"])
  727     if MiscUtil.IsEmpty(SearchPattern):
  728         MiscUtil.PrintError('No value specified using "-s, --searchPattern" option.')
  729     if re.match("^auto$", SearchPattern, re.I):
  730         if re.match("^(AttachAtoms|ReplaceAtoms|AttachSMILES|TandemScan)$", Mode, re.I):
  731             SearchPattern = "[cH]"
  732         else:
  733             SearchPattern = None
  734     else:
  735         if re.match("^RxnSMARTS$", Mode, re.I):
  736             MiscUtil.PrintError(
  737                 'The specification of SMARTS search pattern, %s,  using "-s, --searchPattern" option is not allowed during, %s, value of "-m, --mode".'
  738                 % (SearchPattern, Mode)
  739             )
  740 
  741     if SearchPattern is not None:
  742         SearchPatternMol = Chem.MolFromSmarts(SearchPattern)
  743         if SearchPatternMol is None:
  744             MiscUtil.PrintError(
  745                 'Failed to create search pattern molecule. The torsion SMILES/SMARTS pattern, "%s", specified using "-s, --searchPattern" option is not valid.'
  746                 % (SearchPattern)
  747             )
  748     OptionsInfo["SearchPattern"] = SearchPattern
  749 
  750     # Process and validate target pattern...
  751     TargetPattern = Options["--targetPattern"]
  752     if MiscUtil.IsEmpty(TargetPattern):
  753         MiscUtil.PrintError('No value specified using "-t, --targetPattern" option.')
  754     TargetPattern = TargetPattern.strip()
  755 
  756     if re.match("^auto$", TargetPattern, re.I):
  757         # Setup default target pattern...
  758         if re.match("^AttachAtoms$", Mode, re.I):
  759             TargetPattern = "F"
  760         elif re.match("^ReplaceAtoms$", Mode, re.I):
  761             TargetPattern = "N"
  762         elif re.match("^AttachSMILES$", Mode, re.I):
  763             TargetPattern = "C(F)(F)(F)"
  764         elif re.match("^RxnSMARTS$", Mode, re.I):
  765             TargetPattern = "[cH:1]>>[N:1]"
  766         elif re.match("^TandemScan$", Mode, re.I):
  767             TargetPattern = "ReplaceAtoms,N,AttachAtoms,F"
  768         else:
  769             MiscUtil.PrintError(
  770                 'Failed to setup default target pattern for mode, %s, specified using "-m, --mode"' % (Mode)
  771             )
  772 
  773     TargetPatternInfo = None
  774     TargetPatternTandemScanInfo = None
  775 
  776     if re.match("^(AttachAtoms|ReplaceAtoms)$", Mode, re.I):
  777         TargetPatternInfo = InitializeTargetPatternInfo(TargetPattern)
  778         for ElementSymbol in TargetPatternInfo["Patterns"]:
  779             if not RDKitUtil.IsValidElementSymbol(ElementSymbol):
  780                 MiscUtil.PrintError(
  781                     'The target pattern, "%s", specified using "-t, --targetPattern" must be a valid atomic symbol during, %s, value of "-m, --mode" option.'
  782                     % (ElementSymbol, Mode)
  783                 )
  784     elif re.match("^AttachSMILES$", Mode, re.I):
  785         TargetPatternInfo = InitializeTargetPatternInfo(TargetPattern)
  786         for SMILES in TargetPatternInfo["Patterns"]:
  787             ValidateSMILESTargetPattern(SMILES, Mode)
  788     elif re.match("^RxnSMARTS$", Mode, re.I):
  789         TargetPatternInfo = InitializeTargetPatternInfo(TargetPattern)
  790         for RxnPattern in TargetPatternInfo["Patterns"]:
  791             try:
  792                 AllChem.ReactionFromSmarts(RxnPattern)
  793             except Exception as ErrMsg:
  794                 MiscUtil.PrintError(
  795                     'The target pattern, "%s", specified using "-t, --targetPattern" must be a valid RxnSMARTS  during, %s, value of "-m, --mode" option.\nErrMsg: %s'
  796                     % (RxnPattern, Mode, ErrMsg)
  797                 )
  798     elif re.match("^TandemScan$", Mode, re.I):
  799         TargetPatternTandemScanInfo = ProcessTandemScanTargetPattern(TargetPattern)
  800 
  801     # Track target pattern info...
  802     OptionsInfo["TargetPattern"] = TargetPattern
  803     OptionsInfo["TargetPatternInfo"] = TargetPatternInfo
  804     OptionsInfo["TargetPatternTandemScanInfo"] = TargetPatternTandemScanInfo
  805 
  806 
  807 def InitializeTargetPatternInfo(TargetPattern):
  808     """Initialize target pattern information."""
  809 
  810     TargetPatternInfo = {}
  811     TargetPatternInfo["Patterns"] = TargetPattern.split()
  812 
  813     return TargetPatternInfo
  814 
  815 
  816 def ProcessTandemScanTargetPattern(TargetPattern):
  817     """Process target pattern for tandem scan."""
  818 
  819     # Validate target pattern...
  820     ValidatedTargetPatternInfo = ValidateTandemScanTargetPattern(TargetPattern)
  821 
  822     # Expand target patterns to cover various combinations...
  823     #
  824     # For example: "ReplaceAtoms,N,Attach Atoms, F Cl" expands to
  825     # "ReplaceAtoms,N,AttachAtoms,F" and "ReplaceAtoms,N,AttachAtoms,Cl"
  826     #
  827     ExpandedPatterns = None
  828     for NameIndex, Name in enumerate(ValidatedTargetPatternInfo["Names"]):
  829         Patterns = ValidatedTargetPatternInfo["Patterns"][NameIndex]
  830 
  831         if ExpandedPatterns is None:
  832             ExpandedPatterns = []
  833             for Pattern in Patterns:
  834                 ExpandedPatterns.append([Name, Pattern])
  835             continue
  836 
  837         NewExpandedPatterns = []
  838         for ExpandedPattern in ExpandedPatterns:
  839             for Pattern in Patterns:
  840                 NewExpandedPattern = []
  841                 NewExpandedPattern.extend(ExpandedPattern)
  842 
  843                 NewExpandedPattern.append(Name)
  844                 NewExpandedPattern.append(Pattern)
  845 
  846                 NewExpandedPatterns.append(NewExpandedPattern)
  847 
  848         ExpandedPatterns = NewExpandedPatterns
  849 
  850     # Track target pattern info...
  851     TargetPatternTandemScanInfo = []
  852     for ExpandedPattern in ExpandedPatterns:
  853         PatternInfo = {}
  854         PatternInfo["Names"] = []
  855         PatternInfo["Pattern"] = []
  856 
  857         for Index in range(0, len(ExpandedPattern), 2):
  858             Name = ExpandedPattern[Index].strip()
  859             Pattern = ExpandedPattern[Index + 1].strip()
  860             PatternInfo["Names"].append(Name)
  861             PatternInfo["Pattern"].append(Pattern)
  862 
  863         TargetPatternTandemScanInfo.append(PatternInfo)
  864 
  865     return TargetPatternTandemScanInfo
  866 
  867 
  868 def ValidateTandemScanTargetPattern(TargetPattern):
  869     """Validate target pattern for tandem scan."""
  870 
  871     ValidatedTargetPatternInfo = {}
  872     ValidatedTargetPatternInfo["Names"] = []
  873     ValidatedTargetPatternInfo["Patterns"] = []
  874 
  875     Mode = OptionsInfo["Mode"]
  876 
  877     TargetPatternWords = TargetPattern.split(",")
  878     if len(TargetPatternWords) % 2:
  879         MiscUtil.PrintError(
  880             'The number of comma delimited paramater names and values, %d, specified using "-t --targetPattern" option must be an even number during, %s, value of "-m, --mode" option.'
  881             % (len(TargetPatternWords), Mode)
  882         )
  883 
  884     ValidOperationNames = ["AttachAtoms", "ReplaceAtoms", "AttachSMILES"]
  885     CanonicalOperationNamesMap = {}
  886     for OperationName in ValidOperationNames:
  887         CanonicalOperationNamesMap[OperationName.lower()] = OperationName
  888 
  889     # Validate PAS operation name and value pairs...
  890     for Index in range(0, len(TargetPatternWords), 2):
  891         OperationName = TargetPatternWords[Index].strip()
  892         OperationPattern = TargetPatternWords[Index + 1].strip()
  893 
  894         CanonicalOperationName = OperationName.lower()
  895         if CanonicalOperationName not in CanonicalOperationNamesMap:
  896             MiscUtil.PrintError(
  897                 'The operation name, %s, specified using "-t, --targetPattern" is not a valid during, %s, value of "-m, --mode" option. Supported operation  names: %s'
  898                 % (OperationName, Mode, " ".join(ValidOperationNames))
  899             )
  900 
  901         if not OperationPattern:
  902             MiscUtil.PrintError(
  903                 'Empty target pattern specified using "-t, --targetPattern" must be a valid atomic symbol for operation, %s, during, %s, value of "-m, --mode" option.'
  904                 % (OperationName, Mode)
  905             )
  906 
  907         Name = CanonicalOperationNamesMap[CanonicalOperationName]
  908         OperationPatterns = OperationPattern.split()
  909         if re.match("^(AttachAtoms|ReplaceAtoms)$", OperationName, re.I):
  910             for Pattern in OperationPatterns:
  911                 if not RDKitUtil.IsValidElementSymbol(Pattern):
  912                     MiscUtil.PrintError(
  913                         'The target pattern, "%s", specified using "-t, --targetPattern" must be a valid atomic symbol for operation, %s, during, %s, value of "-m, --mode" option.'
  914                         % (Pattern, OperationName, Mode)
  915                     )
  916         elif re.match("^AttachSMILES$", OperationName, re.I):
  917             for Pattern in OperationPatterns:
  918                 ValidateSMILESTargetPattern(Pattern, Mode, OperationName)
  919 
  920         ValidatedTargetPatternInfo["Names"].append(Name)
  921         ValidatedTargetPatternInfo["Patterns"].append(OperationPatterns)
  922 
  923     return ValidatedTargetPatternInfo
  924 
  925 
  926 def ValidateSMILESTargetPattern(TargetPattern, Mode, OperationName=None):
  927     """Validate SMILES pattern."""
  928 
  929     Mol = Chem.MolFromSmiles(TargetPattern)
  930     if Mol is None:
  931         if OperationName is None:
  932             MiscUtil.PrintError(
  933                 'The target pattern, "%s", specified using "-t, --targetPattern" must be a valid SMILES during, %s, value of "-m, --mode" option.'
  934                 % (TargetPattern, Mode)
  935             )
  936         else:
  937             MiscUtil.PrintError(
  938                 'The target pattern, "%s", specified using "-t, --targetPattern" must be a valid SMILES for operation, %s, during, %s, value of "-m, --mode" option.'
  939                 % (TargetPattern, OperationName, Mode)
  940             )
  941 
  942 
  943 def RetrieveOptions():
  944     """Retrieve command line arguments and options."""
  945 
  946     # Get options...
  947     global Options
  948     Options = docopt(_docoptUsage_)
  949 
  950     # Set current working directory to the specified directory...
  951     WorkingDir = Options["--workingdir"]
  952     if WorkingDir:
  953         os.chdir(WorkingDir)
  954 
  955     # Handle examples option...
  956     if "--examples" in Options and Options["--examples"]:
  957         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
  958         sys.exit(0)
  959 
  960 
  961 def ValidateOptions():
  962     """Validate option values."""
  963 
  964     MiscUtil.ValidateOptionIntegerValue("-c, --comboSearchAtoms", Options["--comboSearchAtoms"], {">": 0})
  965     MiscUtil.ValidateOptionTextValue("--clearAtomMapNumbers", Options["--clearAtomMapNumbers"], "yes no")
  966 
  967     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
  968     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd mol smi txt csv tsv")
  969 
  970     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi txt csv tsv")
  971     MiscUtil.ValidateOptionsOutputFileOverwrite(
  972         "-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]
  973     )
  974     MiscUtil.ValidateOptionsDistinctFileNames(
  975         "-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]
  976     )
  977 
  978     MiscUtil.ValidateOptionTextValue(
  979         "-m, --mode", Options["--mode"], "ReplaceAtoms AttachAtoms AttachSMILES RxnSMARTS TandemScan"
  980     )
  981 
  982     MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
  983     MiscUtil.ValidateOptionTextValue("-p, --permuteTandemScan", Options["--permuteTandemScan"], "yes no")
  984     MiscUtil.ValidateOptionTextValue("-q, --quiet", Options["--quiet"], "yes no")
  985 
  986     MiscUtil.ValidateOptionTextValue(" -u, --uniqueAnalogues", Options["--uniqueAnalogues"], "yes no")
  987 
  988 
  989 # Setup a usage string for docopt...
  990 _docoptUsage_ = """
  991 RDKitPerformPositionalAnalogueScan.py - Positional analogue scanning.
  992 
  993 Usage:
  994     RDKitPerformPositionalAnalogueScan.py [--comboSearchAtoms <number>] [--clearAtomMapNumbers <yes or no>]
  995                                           [--infileParams <Name,Value,...>] [--mode <ReplaceAtoms, AttachAtoms, AttachSMILES, RxnSMARTS, TandemScan>]
  996                                           [--mp <yes or no>] [--mpParams <Name,Value,...>] [ --nameSuffix <text>] [ --outfileParams <Name,Value,...> ]
  997                                           [--overwrite] [--permuteTandemScan <yes or no>] [--quiet <yes or no>] [--searchPattern <SMARTSPattern>]
  998                                           [--targetPattern <ElementSymbol, SMILES, RxnSMARTS,...>] [--uniqueAnalogues <yes or no> ]
  999                                           [-w <dir>] -i <infile>  -o <outfile> 
 1000     RDKitPerformPositionalAnalogueScan.py -h | --help | -e | --examples
 1001 
 1002 Description:
 1003     Perform Positional Analogue Scanning (PAS) to generate analogues of molecules
 1004     by applying chemical transformations to molecules [ Ref 147-148 ]. The chemical
 1005     transformations are defined using SMARTS, element symbols, SMILES, and 
 1006     RxnSMARTS. Four different types of chemical transformations are available for
 1007     for generating analogues of molecules: replace atoms, attach atoms, attach SMILES,
 1008     and RxnSMARTS. Tandem positional analogue scanning may be performed by the
 1009     concurrent application of multiple chemical transformations.
 1010     
 1011     A SMARTS search pattern identifies atoms in molecules for attachment or
 1012     replacement points during positional analogue scanning. It may retrieve multiple
 1013     substructure matches in a molecule. The first matched atom in each substructure
 1014     match comprises a set of attachment or replacement points.
 1015     
 1016     A target pattern encompasses information regarding element symbol, SMILES, and
 1017     reaction SMARTS for replacing and attaching atoms, attaching SMILES, and applying
 1018     reaction SMARTS. In addition, multiple concurrent chemical transformations may
 1019     be specified during tandem positional analogue scanning. 
 1020     
 1021     The supported input file formats are: Mol (.mol), SD (.sdf, .sd), SMILES (.smi,
 1022     .csv, .tsv, .txt)
 1023 
 1024     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi, .csv,
 1025     .tsv, .txt)
 1026 
 1027 Options:
 1028     -c, --comboSearchAtoms <number>  [default: 1]
 1029         Number of concurrent search pattern match atoms to use as attachment or
 1030         replacement points during positional analogue scanning in 'AttachAtoms', 
 1031         'AttachSMILES', and 'ReplaceAtoms' modes. This value is ignored during 
 1032         'RxnSMARTS' and 'TandemScan' values of  '-m, --mode' option.
 1033     --clearAtomMapNumbers <yes or no>  [default: yes]
 1034         Clear any atom map numbers in target SMILES specification for 'AttachSMILES'
 1035         before generating analogues.
 1036     -e, --examples
 1037         Print examples.
 1038     -h, --help
 1039         Print this help message.
 1040     -i, --infile <infile>
 1041         Input file name.
 1042     --infileParams <Name,Value,...>  [default: auto]
 1043         A comma delimited list of parameter name and value pairs for reading
 1044         molecules from files. The supported parameter names for different file
 1045         formats, along with their default values, are shown below:
 1046             
 1047             SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 1048             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 1049                 smilesTitleLine,auto,sanitize,yes
 1050             
 1051         Possible values for smilesDelimiter: space, comma or tab.
 1052     -m, --mode <ReplaceAtoms, AttachAtoms,...>  [default: ReplaceAtoms]
 1053         Type of operations to perform during positional analogue scanning. The
 1054         supported values, along with a brief description, are shown below:
 1055             
 1056             Mode           Description
 1057             ReplaceAtoms   Replace atoms with new atoms
 1058             AttachAtoms    Attach new atoms to atoms
 1059             AttachSMILES   Attach SMILES to atoms
 1060             RxnSMARTS      Run reaction SMARTS
 1061             TandemScan     Perform tandem scan by combining ReplaceAtoms,
 1062                 AttachAtoms, and AttachSMILES                    
 1063              
 1064         The chemical transformations of input molecules is dependent on the
 1065         values of '-s, --searchPattern' and  '-t, --targetPattern' options. For
 1066         example, nitrogen-walk or nitrogen scan is performed by '[cH]' and 'N'
 1067         values for '-s, --searchPattern' and  '-t, --targetPattern' options.
 1068     --mp <yes or no>  [default: no]
 1069         Use multiprocessing.
 1070          
 1071         By default, input data is retrieved in a lazy manner via mp.Pool.imap()
 1072         function employing lazy RDKit data iterable. This allows processing of
 1073         arbitrary large data sets without any additional requirements memory.
 1074         
 1075         All input data may be optionally loaded into memory by mp.Pool.map()
 1076         before starting worker processes in a process pool by setting the value
 1077         of 'inputDataMode' to 'InMemory' in '--mpParams' option.
 1078         
 1079         A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
 1080         data mode may adversely impact the performance. The '--mpParams' section
 1081         provides additional information to tune the value of 'chunkSize'.
 1082     --mpParams <Name,Value,...>  [default: auto]
 1083         A comma delimited list of parameter name and value pairs to configure
 1084         multiprocessing.
 1085         
 1086         The supported parameter names along with their default and possible
 1087         values are shown below:
 1088         
 1089             chunkSize, auto
 1090             inputDataMode, Lazy   [ Possible values: InMemory or Lazy ]
 1091             numProcesses, auto   [ Default: mp.cpu_count() ]
 1092         
 1093         These parameters are used by the following functions to configure and
 1094         control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
 1095         mp.Pool.imap().
 1096         
 1097         The chunkSize determines chunks of input data passed to each worker
 1098         process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
 1099         The default value of chunkSize is dependent on the value of 'inputDataMode'.
 1100         
 1101         The mp.Pool.map() function, invoked during 'InMemory' input data mode,
 1102         automatically converts RDKit data iterable into a list, loads all data into
 1103         memory, and calculates the default chunkSize using the following method
 1104         as shown in its code:
 1105         
 1106             chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
 1107             if extra: chunkSize += 1
 1108         
 1109         For example, the default chunkSize will be 7 for a pool of 4 worker processes
 1110         and 100 data items.
 1111         
 1112         The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
 1113         'lazy' RDKit data iterable to retrieve data as needed, without loading all the
 1114         data into memory. Consequently, the size of input data is not known a priori.
 1115         It's not possible to estimate an optimal value for the chunkSize. The default 
 1116         chunkSize is set to 1.
 1117         
 1118         The default value for the chunkSize during 'Lazy' data mode may adversely
 1119         impact the performance due to the overhead associated with exchanging
 1120         small chunks of data. It is generally a good idea to explicitly set chunkSize to
 1121         a larger value during 'Lazy' input data mode, based on the size of your input
 1122         data and number of processes in the process pool.
 1123         
 1124         The mp.Pool.map() function waits for all worker processes to process all
 1125         the data and return the results. The mp.Pool.imap() function, however,
 1126         returns the the results obtained from worker processes as soon as the
 1127         results become available for specified chunks of data.
 1128         
 1129         The order of data in the results returned by both mp.Pool.map() and 
 1130         mp.Pool.imap() functions always corresponds to the input data.
 1131     -n, --nameSuffix <text>  [default: Analogue]
 1132         Name suffix for generating molecule names of analogues. Format of analogue
 1133         names: <MolName>_<NameSuffix>_<MolNum>
 1134     -o, --outfile <outfile>
 1135         Output file name.
 1136     --outfileParams <Name,Value,...>  [default: auto]
 1137         A comma delimited list of parameter name and value pairs for writing
 1138         molecules to files. The supported parameter names for different file
 1139         formats, along with their default values, are shown below:
 1140             
 1141             SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
 1142             SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 1143                 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no
 1144             
 1145     --overwrite
 1146         Overwrite existing files.
 1147     -p, --permuteTandemScan <yes or no>  [default: yes]
 1148         Permute atom positions matched by SMARTS search pattern in a molecule
 1149         to generate all possible analogues during tandem positional analogue
 1150         scanning.
 1151         
 1152         This option is only valid for 'TandemScan' value of '-m, --mode' option.
 1153     -q, --quiet <yes or no>  [default: no]
 1154         Use quiet mode. The warning and information messages will not be printed.
 1155     -s, --searchPattern <SMARTSPattern>  [default: auto]
 1156         SMARTS search pattern identifying atoms in molecules for attachment or
 1157         replacement points during positional analogue scanning. The SMARTS search
 1158         pattern may retrieve multiple substructure matches in a molecule. The first
 1159         matched atom in each substructure match comprises a set of attachment or
 1160         replacement points.
 1161         
 1162         The default values, dependent on the value of '-m, --mode' option, are
 1163         shown below:
 1164             
 1165             Mode            Default   Description
 1166             ReplaceAtoms    [cH]      Aromatic carbon  
 1167             AttachAtoms     [cH]      Aromatic carbon
 1168             AttachSMILES    [cH]      Aromatic carbon
 1169             RxnSMARTS       None      Not applicable
 1170             TandemScan      [cH]      Aromatic carbon
 1171             
 1172         This option is ignored during 'RxnSMARTS' value of '-m, --mode' option.
 1173     -t, --targetPattern <ElementSymbol, SMILES, RxnSMARTS...>  [default: auto]
 1174         Target pattern for performing chemical transformations during positional
 1175         analogue scanning. These values are used in conjunction with the value of
 1176         '-s, --searchPattern' to generate appropriate analogues.
 1177         
 1178         The default values, dependent on the values of '-m, --mode' and 
 1179         '-s, --searchPattern' options, are shown below:
 1180             
 1181             Mode            Default        Description
 1182             ReplaceAtoms    N              Element symbol for nitrogen
 1183             AttachAtoms     F              Element symbol for fluorine
 1184             AttachSMILES    C(F)(F)(F)     SMILES for CF3
 1185             RxnSMARTS       [cH:1]>>[N:1] Replace aromatic carbon by nitrogen
 1186             TandemScan      ReplaceAtoms,N,AttachAtoms,F  Replace and attach
 1187             
 1188         The target specifications may contain multiple space delimted values. For
 1189         example:
 1190             
 1191             Mode            Example
 1192             ReplaceAtoms    "N P"
 1193             AttachAtoms     "F Cl Br"
 1194             AttachSMILES    "C(F)(F)(F) CO"
 1195             RxnSMARTS       "[cH:1]>>[N:1] [cH:1]>>[P:1]"
 1196             TandemScan      "ReplaceAtoms,N P,AttachAtoms,F Cl"
 1197             
 1198         The target specification for 'AttachSMILES' may contain atom map numbers.
 1199         The attachment point corresponds to the matched atom with the lowest
 1200         atom map number. Otherwise, the first atom in SMILES target specification
 1201         represents the attachment point. For example:
 1202             
 1203             Mode            Example
 1204             AttachSMILES    "C(C)(C)(C) CO"
 1205             AttachSMILES    "C([C:1])(C)(C) C(F)(F)(F)"
 1206             
 1207         Multiple concurrent chemical transformations are allowed during 'TandemScan'. The
 1208         target pattern specification for 'TandemScan' is a comma delimited list of operation
 1209         type and target pattern. Format: OperationType,TargetPattern,...
 1210         
 1211         The supported operation types and target pattern for 'TandemScan' are shown
 1212         below:
 1213             
 1214             ReplaceAtoms,<ElementSymbol>
 1215             AttachAtoms,<ElementSymbol>
 1216             AttachSMILES,<SMILES> 
 1217             
 1218             ReplaceAtoms,"<ElementSymbol> <ElementSymbol>..."
 1219             AttachAtoms,"<ElementSymbol> <ElementSymbol>..."
 1220             AttachSMILES,"<SMILES> <SMIELS>..."
 1221             
 1222         For example:
 1223             
 1224             ReplaceAtoms,N,AttachAtoms,F
 1225             ReplaceAtoms,N,AttachAtoms,F,AttachSMILES,C(F)(F)(F) 
 1226             
 1227             "ReplaceAtoms,N,AttachAtoms,F Cl"
 1228             "ReplaceAtoms,N P,AttachAtoms,F Cl"
 1229             "ReplaceAtoms,N P,AttachAtoms,F Cl,AttachSMILES,C(F)(F)(F) CO" 
 1230                     
 1231         The number of chemical transformations  in 'TandemScan' must be less than or
 1232         equal to the total number atoms matched by SMARTS search pattern in a molecule.
 1233         Otherwise, it is not possible to perform a 'TandemScan'. The matched atom positions
 1234         may be optionally permuted to generate all possible analogues during  positional
 1235         analogue scanning using '-p, --permuteTandemScan' option.
 1236     -u, --uniqueAnalogues <yes or no>  [default: yes]
 1237         Keep only unique analogues of a molecule corresponding to unique SMILES
 1238         strings. The duplicate SMILES string may be generated during PAS due to
 1239         symmetric replacement or attachment points in molecules.
 1240     -w, --workingdir <dir>
 1241         Location of working directory which defaults to the current directory.
 1242 
 1243 Examples:
 1244     To perform a nitrogen-walk or nitrogen scan by replacing aromatic carbons by
 1245     nitrogens in molecules in a SMILES file and write out a SMILES file, type:
 1246 
 1247         % RDKitPerformPositionalAnalogueScan.py  -i Sample.smi -o SampleOut.smi
 1248 
 1249     To run the first example by explicitly specifying search and target patterns
 1250     for replacing aromatic carbons by nitogens in molecules in a SD file and write
 1251     out a SD file, type:
 1252 
 1253         % RDKitPerformPositionalAnalogueScan.py  -m ReplaceAtoms -s "[cH]"
 1254           -t "N" -i Sample.sdf -o SampleOut.sdf
 1255 
 1256     To run the previous example for replacing aromatic carbons by N and P, type:
 1257 
 1258         % RDKitPerformPositionalAnalogueScan.py  -m ReplaceAtoms -s "[cH]"
 1259           -t "N P" -i Sample.sdf -o SampleOut.sdf
 1260 
 1261     To run the first example in multiprocessing mode on all available CPUs
 1262     without loading all data into memory and write out a SD file, type:
 1263 
 1264         % RDKitPerformPositionalAnalogueScan.py  -i Sample.smi -o SampleOut.sdf
 1265           --mp yes
 1266      
 1267     To run the previous example in multiprocessing mode on all available CPUs
 1268     by loading all data into memory and write out a SD file, type:
 1269 
 1270         % RDKitPerformPositionalAnalogueScan.py  -i Sample.smi -o SampleOut.sdf
 1271           --mp yes --mpParams "inputDataMode,InMemory"
 1272      
 1273     To run the previous example in multiprocessing mode on specific number of
 1274     CPUs and chunk size without loading all data into memory and write out a SD file,
 1275     type:
 1276 
 1277         % RDKitPerformPositionalAnalogueScan.py  -i Sample.smi -o SampleOut.sdf
 1278           --mpParams "inputDataMode,Lazy,numProcesses,4,chunkSize,8"
 1279      
 1280     To perform positional analogue scanning by simultaneously attaching fluorines
 1281     to two aromatic carbons in molecules in a SMILES file and write out a SD file,
 1282     type:
 1283 
 1284         % RDKitPerformPositionalAnalogueScan.py  -m AttachAtoms -s "[cH]"
 1285           -t "F" -c 2 -i Sample.smi -o SampleOut.sdf
 1286 
 1287     To perform positional analogue scanning by attaching SMILES to aromatic
 1288     carbons in molecules in a SMILES file and write out a SD file, type:
 1289 
 1290         % RDKitPerformPositionalAnalogueScan.py  -m AttachSMILES -s "[cH]"
 1291           -t "F" -i Sample.smi -o SampleOut.sdf
 1292 
 1293     To run the previous example for attaching F and CF3 to aromatic carbons,
 1294     type:
 1295 
 1296         % RDKitPerformPositionalAnalogueScan.py  -m AttachSMILES -s "[cH]"
 1297           -t "F C(F)(F)(F)" -i Sample.smi -o SampleOut.sdf
 1298 
 1299     To run the previous example for attaching "C(C)(C)(C)" at a specific attachment
 1300     point corresponding to the lowest atom map number,  type:
 1301 
 1302         % RDKitPerformPositionalAnalogueScan.py  -m AttachSMILES -s "[cH]"
 1303           -t "C([C:1])([C:2])(C)" -i Sample.smi -o SampleOut.sdf
 1304 
 1305     To perform a nitrogen-walk or nitrogen scan by using reaction SMARTS to replace
 1306     aromatic carbons by nitrogens in molecules in a SMILES file and write out a  SMILES
 1307     file,  type:
 1308 
 1309         % RDKitPerformPositionalAnalogueScan.py  -m RxnSMARTS
 1310           -t "[cH:1]>>[N:1]" -i Sample.smi -o SampleOut.smi
 1311 
 1312     To perform a tandem positional analogue scan by concurrently applying multiple
 1313     chemical transformations to aromatic carbons, permute all matched search
 1314     atom positions during analogue generation, and write out a SD file, type:
 1315 
 1316         % RDKitPerformPositionalAnalogueScan.py  -m TandemScan -s "[cH]"
 1317           -t "ReplaceAtoms,N,AttachAtoms,F,AttachSMILES,OC"
 1318           -p yes  -i Sample.smi -o SampleOut.smi
 1319 
 1320     To run the previous example for attaching multiple atoms and SMILES to
 1321     aromatic carbons, type:
 1322 
 1323         % RDKitPerformPositionalAnalogueScan.py  -m TandemScan -s "[cH]"
 1324           -t "ReplaceAtoms,N,AttachAtoms,F Cl,AttachSMILES,C(F)(F)(F) OC"
 1325           -p yes  -i Sample.smi -o SampleOut.smi
 1326 
 1327     To perform a nitrogen-walk or nitrogen scan by replacing aromatic carbons by
 1328     nitrogens in molecules in a SMILES CSV fileS, MILES strings in column 1, name
 1329     in column 2, and write out a SD file, type:
 1330 
 1331         % RDKitPerformPositionalAnalogueScan.py  -m ReplaceAtoms -s "[cH]"
 1332           -t "N" --infileParams "smilesDelimiter,comma, smilesTitleLine,yes,
 1333           smilesColumn,1,smilesNameColumn,2"
 1334           -i SampleSMILES.csv -o SampleOut.sdf
 1335 
 1336 Author:
 1337     Manish Sud(msud@san.rr.com)
 1338 
 1339 See also:
 1340     RDKitConvertFileFormat.py, RDKitEnumerateCompoundLibrary.py,
 1341     RDKitPerformTorsionScan.py
 1342 
 1343 Copyright:
 1344     Copyright (C) 2026 Manish Sud. All rights reserved.
 1345 
 1346     The functionality available in this script is implemented using RDKit, an
 1347     open source toolkit for cheminformatics developed by Greg Landrum.
 1348 
 1349     This file is part of MayaChemTools.
 1350 
 1351     MayaChemTools is free software; you can redistribute it and/or modify it under
 1352     the terms of the GNU Lesser General Public License as published by the Free
 1353     Software Foundation; either version 3 of the License, or (at your option) any
 1354     later version.
 1355 
 1356 """
 1357 
 1358 if __name__ == "__main__":
 1359     main()