1 #!/bin/env python
2 #
3 # File: RDKitPerformPositionalAnalogueScan.py
4 # Author: Manish Sud <msud@san.rr.com>
5 #
6 # Copyright (C) 2026 Manish Sud. All rights reserved.
7 #
8 # The functionality available in this script is implemented using RDKit, an
9 # open source toolkit for cheminformatics developed by Greg Landrum.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 from __future__ import print_function
30
31 import os
32 import sys
33 import time
34 import re
35 import multiprocessing as mp
36 from itertools import combinations
37 from itertools import permutations
38
39 # RDKit imports...
40 try:
41 from rdkit import rdBase
42 from rdkit import Chem
43 from rdkit.Chem import AllChem
44 except ImportError as ErrMsg:
45 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
46 sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
47 sys.exit(1)
48
49 # MayaChemTools imports...
50 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
51 try:
52 from docopt import docopt
53 import MiscUtil
54 import RDKitUtil
55 except ImportError as ErrMsg:
56 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
57 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
58 sys.exit(1)
59
60 ScriptName = os.path.basename(sys.argv[0])
61 Options = {}
62 OptionsInfo = {}
63
64
65 def main():
66 """Start execution of the script."""
67
68 MiscUtil.PrintInfo(
69 "\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n"
70 % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())
71 )
72
73 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
74
75 # Retrieve command line arguments and options...
76 RetrieveOptions()
77
78 # Process and validate command line arguments and options...
79 ProcessOptions()
80
81 # Perform actions required by the script...
82 PerformPositionalAnalogueScan()
83
84 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
85 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
86
87
88 def PerformPositionalAnalogueScan():
89 """Perform positional analogue scan."""
90
91 # Setup a molecule reader for input file...
92 MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
93 OptionsInfo["InfileParams"]["AllowEmptyMols"] = True
94 Mols = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])
95
96 # Set up a molecule writer...
97 Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
98 if Writer is None:
99 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"])
100 MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])
101
102 MolCount, ValidMolCount, SearchPatternMissingCount, AnaloguesGenerationFailedCount, AnaloguesCount = (
103 ProcessMolecules(Mols, Writer)
104 )
105
106 if Writer is not None:
107 Writer.close()
108
109 MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
110 MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
111 if not OptionsInfo["RxnSMARTSMode"]:
112 MiscUtil.PrintInfo("Number of molecules with missing search pattern: %d" % SearchPatternMissingCount)
113 MiscUtil.PrintInfo("Number of molecules failed during generation of analogues: %d" % AnaloguesGenerationFailedCount)
114 MiscUtil.PrintInfo(
115 "Number of ignored molecules: %d"
116 % (MolCount - ValidMolCount + SearchPatternMissingCount + AnaloguesGenerationFailedCount)
117 )
118
119 MiscUtil.PrintInfo("\nTotal number of analogues for valid molecules: %d" % AnaloguesCount)
120 MiscUtil.PrintInfo("Average number of analogues per valid molecule: %d" % (int(AnaloguesCount / (ValidMolCount))))
121
122
123 def ProcessMolecules(Mols, Writer):
124 """Process molecules to generate analogues."""
125
126 if OptionsInfo["MPMode"]:
127 return ProcessMoleculesUsingMultipleProcesses(Mols, Writer)
128 else:
129 return ProcessMoleculesUsingSingleProcess(Mols, Writer)
130
131
132 def ProcessMoleculesUsingSingleProcess(Mols, Writer):
133 """Process molecules to generate analogues using a single process."""
134
135 Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
136 SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
137
138 (MolCount, ValidMolCount, SearchPatternMissingCount, AnaloguesGenerationFailedCount, AnaloguesCount) = [0] * 5
139
140 # Initialize search and target patterns info...
141 SetupSearchAndTargetPatternsInfo()
142
143 FirstMol = True
144 for Mol in Mols:
145 MolCount += 1
146
147 if Mol is None:
148 continue
149
150 if RDKitUtil.IsMolEmpty(Mol):
151 if not OptionsInfo["QuietMode"]:
152 MolName = RDKitUtil.GetMolName(Mol, MolCount)
153 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
154 continue
155 ValidMolCount += 1
156
157 MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus = GenerateMolAnalogues(Mol, MolCount)
158
159 if FirstMol:
160 FirstMol = False
161 if SetSMILESMolProps:
162 if Writer is not None:
163 RDKitUtil.SetWriterMolProps(Writer, Mol)
164
165 if not OptionsInfo["RxnSMARTSMode"]:
166 if SearchPatternMissingStatus:
167 SearchPatternMissingCount += 1
168 WriteMolecule(Writer, Mol, Compute2DCoords)
169 continue
170
171 if not AnaloguesGenerationStatus:
172 AnaloguesGenerationFailedCount += 1
173 WriteMolecule(Writer, Mol, Compute2DCoords)
174 continue
175
176 # Write out molecues and analogues...
177 WriteMolAnalogues(Writer, Mol, MolCount, MolAnalogues, Compute2DCoords)
178
179 AnaloguesCount += len(MolAnalogues)
180
181 return (MolCount, ValidMolCount, SearchPatternMissingCount, AnaloguesGenerationFailedCount, AnaloguesCount)
182
183
184 def ProcessMoleculesUsingMultipleProcesses(Mols, Writer):
185 """Process molecules to generate analogues using multiprocessing."""
186
187 MPParams = OptionsInfo["MPParams"]
188
189 # Setup data for initializing a worker process...
190 MiscUtil.PrintInfo("Encoding options info...")
191
192 InitializeWorkerProcessArgs = (
193 MiscUtil.ObjectToBase64EncodedString(Options),
194 MiscUtil.ObjectToBase64EncodedString(OptionsInfo),
195 )
196
197 # Setup a encoded mols data iterable for a worker process...
198 WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)
199
200 # Setup process pool along with data initialization for each process...
201 MiscUtil.PrintInfo(
202 "\nConfiguring multiprocessing using %s method..."
203 % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()")
204 )
205 MiscUtil.PrintInfo(
206 "NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n"
207 % (
208 MPParams["NumProcesses"],
209 MPParams["InputDataMode"],
210 ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"]),
211 )
212 )
213
214 ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)
215
216 # Start processing...
217 if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
218 Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
219 elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
220 Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
221 else:
222 MiscUtil.PrintError(
223 'The value, %s, specified for "--inputDataMode" is not supported.' % (MPParams["InputDataMode"])
224 )
225
226 Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
227 SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
228
229 (MolCount, ValidMolCount, SearchPatternMissingCount, AnaloguesGenerationFailedCount, AnaloguesCount) = [0] * 5
230 FirstMol = True
231 for Result in Results:
232 MolCount += 1
233 MolIndex, EncodedMol, EncodedMolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus = Result
234
235 if EncodedMol is None:
236 continue
237 ValidMolCount += 1
238
239 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
240
241 if FirstMol:
242 FirstMol = False
243 if SetSMILESMolProps:
244 if Writer is not None:
245 RDKitUtil.SetWriterMolProps(Writer, Mol)
246
247 if not OptionsInfo["RxnSMARTSMode"]:
248 if SearchPatternMissingStatus:
249 SearchPatternMissingCount += 1
250 WriteMolecule(Writer, Mol, Compute2DCoords)
251 continue
252
253 if not AnaloguesGenerationStatus:
254 AnaloguesGenerationFailedCount += 1
255 WriteMolecule(Writer, Mol, Compute2DCoords)
256 continue
257
258 MolAnalogues = [
259 RDKitUtil.MolFromBase64EncodedMolString(EncodedMolAnalogue) for EncodedMolAnalogue in EncodedMolAnalogues
260 ]
261
262 # Write out molecues and analogues...
263 WriteMolAnalogues(Writer, Mol, MolCount, MolAnalogues, Compute2DCoords)
264
265 AnaloguesCount += len(MolAnalogues)
266
267 return (MolCount, ValidMolCount, SearchPatternMissingCount, AnaloguesGenerationFailedCount, AnaloguesCount)
268
269
270 def InitializeWorkerProcess(*EncodedArgs):
271 """Initialize data for a worker process."""
272
273 global Options, OptionsInfo
274
275 MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())
276
277 # Decode Options and OptionInfo...
278 Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
279 OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])
280
281 # Initialize search and target patterns info...
282 SetupSearchAndTargetPatternsInfo()
283
284
285 def WorkerProcess(EncodedMolInfo):
286 """Process data for a worker process."""
287
288 MolIndex, EncodedMol = EncodedMolInfo
289
290 MolAnalogues = None
291 SearchPatternMissingStatus = False
292 AnaloguesGenerationStatus = True
293
294 if EncodedMol is None:
295 return [MolIndex, None, MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus]
296
297 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
298 if RDKitUtil.IsMolEmpty(Mol):
299 if not OptionsInfo["QuietMode"]:
300 MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
301 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
302 return [MolIndex, None, MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus]
303
304 MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus = GenerateMolAnalogues(Mol, (MolIndex + 1))
305
306 EncodedMolAnalogues = None
307 if MolAnalogues is not None:
308 EncodedMolAnalogues = [
309 RDKitUtil.MolToBase64EncodedMolString(
310 MolAnalogue,
311 PropertyPickleFlags=Chem.PropertyPickleOptions.MolProps | Chem.PropertyPickleOptions.PrivateProps,
312 )
313 for MolAnalogue in MolAnalogues
314 ]
315
316 return [MolIndex, EncodedMol, EncodedMolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus]
317
318
319 def GenerateMolAnalogues(Mol, MolNum=None):
320 """Generate analogues."""
321
322 if OptionsInfo["AttachAtomsMode"] or OptionsInfo["ReplaceAtomsMode"] or OptionsInfo["AttachSMILESMode"]:
323 return GenerateMolAnaloguesForMultipleScanModes(Mol, MolNum)
324 elif OptionsInfo["RxnSMARTSMode"]:
325 return GenerateMolAnaloguesForRxnSMARTSMode(Mol, MolNum)
326 elif OptionsInfo["TandemScanMode"]:
327 return GenerateMolAnaloguesForTandemScanMode(Mol, MolNum)
328 else:
329 MiscUtil.PrintError(
330 'Failed to generate analogues. The mode value, %s, specified using "-m, --mode" is not supported.'
331 % (OptionsInfo["Mode"])
332 )
333
334
335 def GenerateMolAnaloguesForMultipleScanModes(Mol, MolNum=None):
336 """Generate analogues for the following scan modes: AttachAtom, ReplaceAtom
337 or AttachSMILES."""
338
339 MolAnalogues = None
340 SearchPatternMissingStatus = False
341 AnaloguesGenerationStatus = True
342
343 # Retrieve matched atoms...
344 MolMatchedAtomIndices = SetupMolMatchedAtomIndices(Mol, MolNum)
345 if MolMatchedAtomIndices is None:
346 SearchPatternMissingStatus = True
347 return (MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus)
348
349 MolAnalogues = []
350 MolAnaloguesSet = set()
351
352 TargetPatternInfo = OptionsInfo["TargetPatternInfo"]
353 for TargetPatternIndex, TargetPatternObject in enumerate(TargetPatternInfo["PatternObjects"]):
354 for AtomIndicesCombination in combinations(MolMatchedAtomIndices, OptionsInfo["ComboSearchAtoms"]):
355 MolAnalogue = Chem.RWMol(Mol)
356
357 for AtomIndex in AtomIndicesCombination:
358 if OptionsInfo["AttachAtomsMode"]:
359 AtomicNumber = TargetPatternObject
360 MolAnalogue = AttachAtom(
361 MolAnalogue, AtomIndex, AtomicNumber, BondOrder=Chem.rdchem.BondType.SINGLE
362 )
363 elif OptionsInfo["ReplaceAtomsMode"]:
364 AtomicNumber = TargetPatternObject
365 MolAnalogue = ReplaceAtom(MolAnalogue, AtomIndex, AtomicNumber)
366 elif OptionsInfo["AttachSMILESMode"]:
367 AttachAtomIndex = (
368 MolAnalogue.GetNumAtoms()
369 - 1
370 + TargetPatternInfo["AttachSMILESAtomIndexOffsets"][TargetPatternIndex]
371 )
372 SMILESMol = TargetPatternObject
373 MolAnalogue = AttachMol(
374 MolAnalogue, AtomIndex, SMILESMol, AttachAtomIndex, BondOrder=Chem.rdchem.BondType.SINGLE
375 )
376
377 if SanitizeMolAnalogue(MolAnalogue, MolNum):
378 TrackAnalogues(MolAnalogue, MolAnaloguesSet, MolAnalogues)
379
380 if len(MolAnalogues) == 0:
381 AnaloguesGenerationStatus = False
382
383 return (MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus)
384
385
386 def GenerateMolAnaloguesForRxnSMARTSMode(Mol, MolNum=None):
387 """Generate analogues for reaction SMARTS mode."""
388
389 SearchPatternMissingStatus = False
390 AnaloguesGenerationStatus = True
391
392 MolAnalogues = []
393 MolAnaloguesSet = set()
394
395 TargetPatternInfo = OptionsInfo["TargetPatternInfo"]
396 for Rxn in TargetPatternInfo["PatternObjects"]:
397 RxnProducts = Rxn.RunReactants((Mol,))
398
399 if not len(RxnProducts):
400 continue
401
402 for RxnProduct in RxnProducts:
403 MolAnalogue = RxnProduct[0]
404
405 if SanitizeMolAnalogue(MolAnalogue, MolNum):
406 TrackAnalogues(MolAnalogue, MolAnaloguesSet, MolAnalogues)
407
408 if len(MolAnalogues) == 0:
409 AnaloguesGenerationStatus = False
410
411 return (MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus)
412
413
414 def GenerateMolAnaloguesForTandemScanMode(Mol, MolNum=None):
415 """Generate analogues for tandem scan mode."""
416
417 MolAnalogues = None
418 SearchPatternMissingStatus = False
419 AnaloguesGenerationStatus = True
420
421 # Retrieve matched atoms...
422 MolMatchedAtomIndices = SetupMolMatchedAtomIndices(Mol, MolNum)
423 if MolMatchedAtomIndices is None:
424 SearchPatternMissingStatus = True
425 return (MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus)
426
427 TargetPatternTandemScanInfo = OptionsInfo["TargetPatternTandemScanInfo"]
428 NumOfTandemOperations = 0
429 if TargetPatternTandemScanInfo is not None and len(TargetPatternTandemScanInfo):
430 NumOfTandemOperations = len(TargetPatternTandemScanInfo[0]["Names"])
431
432 if len(MolMatchedAtomIndices) < NumOfTandemOperations:
433 if not OptionsInfo["QuietMode"]:
434 MolName = RDKitUtil.GetMolName(Mol, MolNum)
435 MiscUtil.PrintWarning(
436 "Number of matched search pattern atoms, %s, must be >= number of specified operations, %s, during TandemScan mode for molecule %s..."
437 % (len(MolMatchedAtomIndices), NumOfTandemOperations, MolName)
438 )
439 SearchPatternMissingStatus = True
440 return (MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus)
441
442 MolAnalogues = []
443 MolAnaloguesSet = set()
444 for AtomIndicesCombination in combinations(MolMatchedAtomIndices, NumOfTandemOperations):
445 AtomIndicesPermutations = []
446 if OptionsInfo["PermuteTandemScan"]:
447 for AtomIndicesPermutation in permutations(AtomIndicesCombination, NumOfTandemOperations):
448 AtomIndicesPermutations.append(list(AtomIndicesPermutation))
449 else:
450 AtomIndicesPermutations.append(list(AtomIndicesCombination))
451
452 for AtomIndicesPermutation in AtomIndicesPermutations:
453 for PatternInfo in TargetPatternTandemScanInfo:
454 MolAnalogue = Chem.RWMol(Mol)
455
456 for NameIndex, Name in enumerate(PatternInfo["Names"]):
457 AtomIndex = AtomIndicesPermutation[NameIndex]
458 PatternObject = PatternInfo["PatternObjects"][NameIndex]
459
460 if re.match("^AttachAtoms$", Name, re.I):
461 MolAnalogue = AttachAtom(
462 MolAnalogue, AtomIndex, PatternObject, BondOrder=Chem.rdchem.BondType.SINGLE
463 )
464 elif re.match("^ReplaceAtoms$", Name, re.I):
465 MolAnalogue = ReplaceAtom(MolAnalogue, AtomIndex, PatternObject)
466 elif re.match("^AttachSMILES$", Name, re.I):
467 AttachAtomIndex = (
468 MolAnalogue.GetNumAtoms() - 1 + PatternInfo["AttachSMILESAtomIndexOffsets"][NameIndex]
469 )
470 MolAnalogue = AttachMol(
471 MolAnalogue,
472 AtomIndex,
473 PatternObject,
474 AttachAtomIndex,
475 BondOrder=Chem.rdchem.BondType.SINGLE,
476 )
477 MolAnalogue = Chem.RWMol(MolAnalogue)
478
479 if not SanitizeMolAnalogue(MolAnalogue, MolNum):
480 continue
481
482 TrackAnalogues(MolAnalogue, MolAnaloguesSet, MolAnalogues)
483
484 if len(MolAnalogues) == 0:
485 AnaloguesGenerationStatus = False
486
487 return (MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus)
488
489
490 def TrackAnalogues(MolAnalogue, MolAnaloguesSet, MolAnalogues):
491 """Track analogues."""
492
493 if OptionsInfo["UniqueAnalogues"]:
494 AnalogueSMILES = Chem.MolToSmiles(Chem.RemoveHs(MolAnalogue))
495 if AnalogueSMILES not in MolAnaloguesSet:
496 MolAnaloguesSet.add(AnalogueSMILES)
497 MolAnalogues.append(MolAnalogue)
498 else:
499 MolAnalogues.append(MolAnalogue)
500
501
502 def ReplaceAtom(Mol, AtomIndex, AtomicNumber):
503 """Replace an existing atom with a new atom in a molecule."""
504
505 Atom = Mol.GetAtomWithIdx(AtomIndex)
506 Atom.SetAtomicNum(AtomicNumber)
507
508 return Mol
509
510
511 def AttachAtom(Mol, AtomIndex, AtomicNumber, BondOrder=Chem.rdchem.BondType.SINGLE):
512 """Attach atom to an existing atom in a molecule."""
513
514 NewAtomIndex = Mol.AddAtom(Chem.Atom(AtomicNumber))
515 Mol.AddBond(AtomIndex, NewAtomIndex, BondOrder)
516
517 return Mol
518
519
520 def AttachMol(Mol, AtomIndex, AttachMol, AttachAtomIndex, BondOrder=Chem.rdchem.BondType.SINGLE):
521 """Attach molecule to an exisiting molecule."""
522
523 NewMol = Chem.CombineMols(Mol, AttachMol)
524
525 NewMol = Chem.EditableMol(NewMol)
526 NewMol.AddBond(AtomIndex, AttachAtomIndex, order=BondOrder)
527 NewMol = NewMol.GetMol()
528
529 return NewMol
530
531
532 def SanitizeMolAnalogue(Mol, MolNum):
533 """Sanitize analogue and catch any errors."""
534
535 Status = True
536 try:
537 Chem.SanitizeMol(Mol)
538 except Exception as ErrMsg:
539 if not OptionsInfo["QuietMode"]:
540 MolName = RDKitUtil.GetMolName(Mol, MolNum)
541 MiscUtil.PrintInfo(
542 "\n%s\nFailed to sanitize analogue of molecule %s...\nIgnoring analogue..." % (ErrMsg, MolName)
543 )
544 Status = False
545
546 return Status
547
548
549 def SetupMolMatchedAtomIndices(Mol, MolNum):
550 """Setup a list of matched atom indices corresponding to first atom in search
551 SMARTS pattern for a molecule."""
552
553 # Retrieve first atom match for PAS...
554 MatchedAtomIndices = [AtomIndices[0] for AtomIndices in Mol.GetSubstructMatches(OptionsInfo["SearchPatternMol"])]
555 if len(MatchedAtomIndices) == 0:
556 if not OptionsInfo["QuietMode"]:
557 MolName = RDKitUtil.GetMolName(Mol, MolNum)
558 MiscUtil.PrintWarning("No search pattern match for molecule %s..." % MolName)
559 return None
560
561 return MatchedAtomIndices
562
563
564 def WriteMolAnalogues(Writer, Mol, MolNum, MolAnalogues, Compute2DCoords):
565 """Write molecule analogues."""
566
567 # Write out molecule...
568 WriteMolecule(Writer, Mol, Compute2DCoords)
569
570 if MolAnalogues is None:
571 return
572
573 # Write out analogues...
574 for Index, MolAnalogue in enumerate(MolAnalogues):
575 MolName = RDKitUtil.GetMolName(Mol, MolNum)
576 SetMolAnalogueName(MolAnalogue, MolName, (Index + 1))
577
578 WriteMolecule(Writer, MolAnalogue, Compute2DCoords)
579
580
581 def SetMolAnalogueName(MolAnalogue, MolName, AnalogueCount):
582 """Set analogue name."""
583
584 NameSuffix = OptionsInfo["NameSuffix"]
585 AnalogueName = "%s_%s%d" % (MolName, NameSuffix, AnalogueCount)
586 MolAnalogue.SetProp("_Name", AnalogueName)
587
588
589 def WriteMolecule(Writer, Mol, Compute2DCoords):
590 """Write out molecule."""
591
592 if Compute2DCoords:
593 AllChem.Compute2DCoords(Mol)
594
595 Writer.write(Mol)
596
597
598 def SetupSearchAndTargetPatternsInfo():
599 """Setup search and target patterns info."""
600
601 # Setup search pattern molecule...
602 SearchPattern = OptionsInfo["SearchPattern"]
603 SearchPatternMol = None
604 if SearchPattern is not None:
605 SearchPatternMol = Chem.MolFromSmarts(SearchPattern)
606 if SearchPatternMol is None:
607 MiscUtil.PrintError(
608 'Failed to create search pattern molecule. The torsion SMILES/SMARTS pattern, "%s", specified using "-s, --searchPattern" option is not valid.'
609 % (SearchPattern)
610 )
611 OptionsInfo["SearchPatternMol"] = SearchPatternMol
612
613 # Setup target pattern information...
614 Mode = OptionsInfo["Mode"]
615
616 TargetPatternInfo = OptionsInfo["TargetPatternInfo"]
617 if TargetPatternInfo is not None:
618 TargetPatternInfo["PatternObjects"] = []
619 TargetPatternInfo["AttachSMILESAtomIndexOffsets"] = []
620
621 if re.match("^(AttachAtoms|ReplaceAtoms)$", Mode, re.I):
622 for Pattern in TargetPatternInfo["Patterns"]:
623 TargetPatternInfo["PatternObjects"].append(Chem.GetPeriodicTable().GetAtomicNumber(Pattern))
624 TargetPatternInfo["AttachSMILESAtomIndexOffsets"].append(None)
625 elif re.match("^AttachSMILES$", Mode, re.I):
626 for Pattern in TargetPatternInfo["Patterns"]:
627 SMILESMol = Chem.MolFromSmiles(Pattern)
628 AttachAtomIndexOffset = GetAttachSMILESAtomIndexOffset(SMILESMol)
629
630 TargetPatternInfo["PatternObjects"].append(SMILESMol)
631 TargetPatternInfo["AttachSMILESAtomIndexOffsets"].append(AttachAtomIndexOffset)
632 elif re.match("^RxnSMARTS$", Mode, re.I):
633 for Pattern in TargetPatternInfo["Patterns"]:
634 TargetPatternInfo["PatternObjects"].append(AllChem.ReactionFromSmarts(Pattern))
635 TargetPatternInfo["AttachSMILESAtomIndexOffsets"].append(None)
636 elif re.match("^TandemScan$", Mode, re.I):
637 TargetPatternTandemScanInfo = OptionsInfo["TargetPatternTandemScanInfo"]
638 for PatternInfo in TargetPatternTandemScanInfo:
639 PatternInfo["PatternObjects"] = []
640 PatternInfo["AttachSMILESAtomIndexOffsets"] = []
641 for NameIndex, Name in enumerate(PatternInfo["Names"]):
642 Pattern = PatternInfo["Pattern"][NameIndex]
643 if re.match("^(AttachAtoms|ReplaceAtoms)$", Name, re.I):
644 PatternInfo["PatternObjects"].append(Chem.GetPeriodicTable().GetAtomicNumber(Pattern))
645 PatternInfo["AttachSMILESAtomIndexOffsets"].append(None)
646 elif re.match("^AttachSMILES$", Name, re.I):
647 SMILESMol = Chem.MolFromSmiles(Pattern)
648 AttachAtomIndexOffset = GetAttachSMILESAtomIndexOffset(SMILESMol)
649
650 PatternInfo["PatternObjects"].append(SMILESMol)
651 PatternInfo["AttachSMILESAtomIndexOffsets"].append(AttachAtomIndexOffset)
652
653
654 def GetAttachSMILESAtomIndexOffset(Mol):
655 """Get attach atom index offset and optionally clear atom map numbers."""
656
657 AttachAtomIndexOffset = 1
658
659 AtomMapIndices = RDKitUtil.GetAtomMapIndices(Mol)
660 if AtomMapIndices is not None and len(AtomMapIndices):
661 # Use the atom index corresponding to the lowest atom map number
662 # as the attachement atom. The AtomMapIndices list contains atom indices
663 # sorted by atom map numbers.
664
665 AttachAtomIndexOffset = AtomMapIndices[0] + 1
666
667 if OptionsInfo["ClearAtomMapNumbers"]:
668 RDKitUtil.ClearAtomMapNumbers(Mol)
669
670 return AttachAtomIndexOffset
671
672
673 def ProcessOptions():
674 """Process and validate command line arguments and options."""
675
676 MiscUtil.PrintInfo("Processing options...")
677
678 # Validate options...
679 ValidateOptions()
680
681 OptionsInfo["ComboSearchAtoms"] = int(Options["--comboSearchAtoms"])
682 OptionsInfo["ClearAtomMapNumbers"] = True if re.match("^yes$", Options["--clearAtomMapNumbers"], re.I) else False
683
684 OptionsInfo["Infile"] = Options["--infile"]
685 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters(
686 "--infileParams", Options["--infileParams"], Options["--infile"]
687 )
688
689 OptionsInfo["Outfile"] = Options["--outfile"]
690 OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters(
691 "--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"]
692 )
693 OptionsInfo["Overwrite"] = Options["--overwrite"]
694
695 OptionsInfo["Mode"] = Options["--mode"]
696 (AttachAtomsMode, ReplaceAtomsMode, AttachSMILESMode, RxnSMARTSMode, TandemScanMode) = [False] * 5
697 Mode = OptionsInfo["Mode"]
698 if re.match("^AttachAtoms$", Mode, re.I):
699 AttachAtomsMode = True
700 elif re.match("^ReplaceAtoms$", Mode, re.I):
701 ReplaceAtomsMode = True
702 elif re.match("^AttachSMILES$", Mode, re.I):
703 AttachSMILESMode = True
704 elif re.match("^RxnSMARTS$", Mode, re.I):
705 RxnSMARTSMode = True
706 elif re.match("^TandemScan$", Mode, re.I):
707 TandemScanMode = True
708 else:
709 MiscUtil.PrintError('The mode value , %s, specified using "-m, --mode" option is not valid' % (Mode))
710 OptionsInfo["AttachAtomsMode"] = AttachAtomsMode
711 OptionsInfo["ReplaceAtomsMode"] = ReplaceAtomsMode
712 OptionsInfo["AttachSMILESMode"] = AttachSMILESMode
713 OptionsInfo["RxnSMARTSMode"] = RxnSMARTSMode
714 OptionsInfo["TandemScanMode"] = TandemScanMode
715
716 OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False
717 OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])
718
719 OptionsInfo["NameSuffix"] = Options["--nameSuffix"]
720 OptionsInfo["PermuteTandemScan"] = True if re.match("^yes$", Options["--permuteTandemScan"], re.I) else False
721 OptionsInfo["QuietMode"] = True if re.match("^yes$", Options["--quiet"], re.I) else False
722
723 OptionsInfo["UniqueAnalogues"] = True if re.match("^yes$", Options["--uniqueAnalogues"], re.I) else False
724
725 # Process and validate search pattern...
726 SearchPattern = re.sub(" ", "", Options["--searchPattern"])
727 if MiscUtil.IsEmpty(SearchPattern):
728 MiscUtil.PrintError('No value specified using "-s, --searchPattern" option.')
729 if re.match("^auto$", SearchPattern, re.I):
730 if re.match("^(AttachAtoms|ReplaceAtoms|AttachSMILES|TandemScan)$", Mode, re.I):
731 SearchPattern = "[cH]"
732 else:
733 SearchPattern = None
734 else:
735 if re.match("^RxnSMARTS$", Mode, re.I):
736 MiscUtil.PrintError(
737 'The specification of SMARTS search pattern, %s, using "-s, --searchPattern" option is not allowed during, %s, value of "-m, --mode".'
738 % (SearchPattern, Mode)
739 )
740
741 if SearchPattern is not None:
742 SearchPatternMol = Chem.MolFromSmarts(SearchPattern)
743 if SearchPatternMol is None:
744 MiscUtil.PrintError(
745 'Failed to create search pattern molecule. The torsion SMILES/SMARTS pattern, "%s", specified using "-s, --searchPattern" option is not valid.'
746 % (SearchPattern)
747 )
748 OptionsInfo["SearchPattern"] = SearchPattern
749
750 # Process and validate target pattern...
751 TargetPattern = Options["--targetPattern"]
752 if MiscUtil.IsEmpty(TargetPattern):
753 MiscUtil.PrintError('No value specified using "-t, --targetPattern" option.')
754 TargetPattern = TargetPattern.strip()
755
756 if re.match("^auto$", TargetPattern, re.I):
757 # Setup default target pattern...
758 if re.match("^AttachAtoms$", Mode, re.I):
759 TargetPattern = "F"
760 elif re.match("^ReplaceAtoms$", Mode, re.I):
761 TargetPattern = "N"
762 elif re.match("^AttachSMILES$", Mode, re.I):
763 TargetPattern = "C(F)(F)(F)"
764 elif re.match("^RxnSMARTS$", Mode, re.I):
765 TargetPattern = "[cH:1]>>[N:1]"
766 elif re.match("^TandemScan$", Mode, re.I):
767 TargetPattern = "ReplaceAtoms,N,AttachAtoms,F"
768 else:
769 MiscUtil.PrintError(
770 'Failed to setup default target pattern for mode, %s, specified using "-m, --mode"' % (Mode)
771 )
772
773 TargetPatternInfo = None
774 TargetPatternTandemScanInfo = None
775
776 if re.match("^(AttachAtoms|ReplaceAtoms)$", Mode, re.I):
777 TargetPatternInfo = InitializeTargetPatternInfo(TargetPattern)
778 for ElementSymbol in TargetPatternInfo["Patterns"]:
779 if not RDKitUtil.IsValidElementSymbol(ElementSymbol):
780 MiscUtil.PrintError(
781 'The target pattern, "%s", specified using "-t, --targetPattern" must be a valid atomic symbol during, %s, value of "-m, --mode" option.'
782 % (ElementSymbol, Mode)
783 )
784 elif re.match("^AttachSMILES$", Mode, re.I):
785 TargetPatternInfo = InitializeTargetPatternInfo(TargetPattern)
786 for SMILES in TargetPatternInfo["Patterns"]:
787 ValidateSMILESTargetPattern(SMILES, Mode)
788 elif re.match("^RxnSMARTS$", Mode, re.I):
789 TargetPatternInfo = InitializeTargetPatternInfo(TargetPattern)
790 for RxnPattern in TargetPatternInfo["Patterns"]:
791 try:
792 AllChem.ReactionFromSmarts(RxnPattern)
793 except Exception as ErrMsg:
794 MiscUtil.PrintError(
795 'The target pattern, "%s", specified using "-t, --targetPattern" must be a valid RxnSMARTS during, %s, value of "-m, --mode" option.\nErrMsg: %s'
796 % (RxnPattern, Mode, ErrMsg)
797 )
798 elif re.match("^TandemScan$", Mode, re.I):
799 TargetPatternTandemScanInfo = ProcessTandemScanTargetPattern(TargetPattern)
800
801 # Track target pattern info...
802 OptionsInfo["TargetPattern"] = TargetPattern
803 OptionsInfo["TargetPatternInfo"] = TargetPatternInfo
804 OptionsInfo["TargetPatternTandemScanInfo"] = TargetPatternTandemScanInfo
805
806
807 def InitializeTargetPatternInfo(TargetPattern):
808 """Initialize target pattern information."""
809
810 TargetPatternInfo = {}
811 TargetPatternInfo["Patterns"] = TargetPattern.split()
812
813 return TargetPatternInfo
814
815
816 def ProcessTandemScanTargetPattern(TargetPattern):
817 """Process target pattern for tandem scan."""
818
819 # Validate target pattern...
820 ValidatedTargetPatternInfo = ValidateTandemScanTargetPattern(TargetPattern)
821
822 # Expand target patterns to cover various combinations...
823 #
824 # For example: "ReplaceAtoms,N,Attach Atoms, F Cl" expands to
825 # "ReplaceAtoms,N,AttachAtoms,F" and "ReplaceAtoms,N,AttachAtoms,Cl"
826 #
827 ExpandedPatterns = None
828 for NameIndex, Name in enumerate(ValidatedTargetPatternInfo["Names"]):
829 Patterns = ValidatedTargetPatternInfo["Patterns"][NameIndex]
830
831 if ExpandedPatterns is None:
832 ExpandedPatterns = []
833 for Pattern in Patterns:
834 ExpandedPatterns.append([Name, Pattern])
835 continue
836
837 NewExpandedPatterns = []
838 for ExpandedPattern in ExpandedPatterns:
839 for Pattern in Patterns:
840 NewExpandedPattern = []
841 NewExpandedPattern.extend(ExpandedPattern)
842
843 NewExpandedPattern.append(Name)
844 NewExpandedPattern.append(Pattern)
845
846 NewExpandedPatterns.append(NewExpandedPattern)
847
848 ExpandedPatterns = NewExpandedPatterns
849
850 # Track target pattern info...
851 TargetPatternTandemScanInfo = []
852 for ExpandedPattern in ExpandedPatterns:
853 PatternInfo = {}
854 PatternInfo["Names"] = []
855 PatternInfo["Pattern"] = []
856
857 for Index in range(0, len(ExpandedPattern), 2):
858 Name = ExpandedPattern[Index].strip()
859 Pattern = ExpandedPattern[Index + 1].strip()
860 PatternInfo["Names"].append(Name)
861 PatternInfo["Pattern"].append(Pattern)
862
863 TargetPatternTandemScanInfo.append(PatternInfo)
864
865 return TargetPatternTandemScanInfo
866
867
868 def ValidateTandemScanTargetPattern(TargetPattern):
869 """Validate target pattern for tandem scan."""
870
871 ValidatedTargetPatternInfo = {}
872 ValidatedTargetPatternInfo["Names"] = []
873 ValidatedTargetPatternInfo["Patterns"] = []
874
875 Mode = OptionsInfo["Mode"]
876
877 TargetPatternWords = TargetPattern.split(",")
878 if len(TargetPatternWords) % 2:
879 MiscUtil.PrintError(
880 'The number of comma delimited paramater names and values, %d, specified using "-t --targetPattern" option must be an even number during, %s, value of "-m, --mode" option.'
881 % (len(TargetPatternWords), Mode)
882 )
883
884 ValidOperationNames = ["AttachAtoms", "ReplaceAtoms", "AttachSMILES"]
885 CanonicalOperationNamesMap = {}
886 for OperationName in ValidOperationNames:
887 CanonicalOperationNamesMap[OperationName.lower()] = OperationName
888
889 # Validate PAS operation name and value pairs...
890 for Index in range(0, len(TargetPatternWords), 2):
891 OperationName = TargetPatternWords[Index].strip()
892 OperationPattern = TargetPatternWords[Index + 1].strip()
893
894 CanonicalOperationName = OperationName.lower()
895 if CanonicalOperationName not in CanonicalOperationNamesMap:
896 MiscUtil.PrintError(
897 'The operation name, %s, specified using "-t, --targetPattern" is not a valid during, %s, value of "-m, --mode" option. Supported operation names: %s'
898 % (OperationName, Mode, " ".join(ValidOperationNames))
899 )
900
901 if not OperationPattern:
902 MiscUtil.PrintError(
903 'Empty target pattern specified using "-t, --targetPattern" must be a valid atomic symbol for operation, %s, during, %s, value of "-m, --mode" option.'
904 % (OperationName, Mode)
905 )
906
907 Name = CanonicalOperationNamesMap[CanonicalOperationName]
908 OperationPatterns = OperationPattern.split()
909 if re.match("^(AttachAtoms|ReplaceAtoms)$", OperationName, re.I):
910 for Pattern in OperationPatterns:
911 if not RDKitUtil.IsValidElementSymbol(Pattern):
912 MiscUtil.PrintError(
913 'The target pattern, "%s", specified using "-t, --targetPattern" must be a valid atomic symbol for operation, %s, during, %s, value of "-m, --mode" option.'
914 % (Pattern, OperationName, Mode)
915 )
916 elif re.match("^AttachSMILES$", OperationName, re.I):
917 for Pattern in OperationPatterns:
918 ValidateSMILESTargetPattern(Pattern, Mode, OperationName)
919
920 ValidatedTargetPatternInfo["Names"].append(Name)
921 ValidatedTargetPatternInfo["Patterns"].append(OperationPatterns)
922
923 return ValidatedTargetPatternInfo
924
925
926 def ValidateSMILESTargetPattern(TargetPattern, Mode, OperationName=None):
927 """Validate SMILES pattern."""
928
929 Mol = Chem.MolFromSmiles(TargetPattern)
930 if Mol is None:
931 if OperationName is None:
932 MiscUtil.PrintError(
933 'The target pattern, "%s", specified using "-t, --targetPattern" must be a valid SMILES during, %s, value of "-m, --mode" option.'
934 % (TargetPattern, Mode)
935 )
936 else:
937 MiscUtil.PrintError(
938 'The target pattern, "%s", specified using "-t, --targetPattern" must be a valid SMILES for operation, %s, during, %s, value of "-m, --mode" option.'
939 % (TargetPattern, OperationName, Mode)
940 )
941
942
943 def RetrieveOptions():
944 """Retrieve command line arguments and options."""
945
946 # Get options...
947 global Options
948 Options = docopt(_docoptUsage_)
949
950 # Set current working directory to the specified directory...
951 WorkingDir = Options["--workingdir"]
952 if WorkingDir:
953 os.chdir(WorkingDir)
954
955 # Handle examples option...
956 if "--examples" in Options and Options["--examples"]:
957 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
958 sys.exit(0)
959
960
961 def ValidateOptions():
962 """Validate option values."""
963
964 MiscUtil.ValidateOptionIntegerValue("-c, --comboSearchAtoms", Options["--comboSearchAtoms"], {">": 0})
965 MiscUtil.ValidateOptionTextValue("--clearAtomMapNumbers", Options["--clearAtomMapNumbers"], "yes no")
966
967 MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
968 MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd mol smi txt csv tsv")
969
970 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi txt csv tsv")
971 MiscUtil.ValidateOptionsOutputFileOverwrite(
972 "-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]
973 )
974 MiscUtil.ValidateOptionsDistinctFileNames(
975 "-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]
976 )
977
978 MiscUtil.ValidateOptionTextValue(
979 "-m, --mode", Options["--mode"], "ReplaceAtoms AttachAtoms AttachSMILES RxnSMARTS TandemScan"
980 )
981
982 MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
983 MiscUtil.ValidateOptionTextValue("-p, --permuteTandemScan", Options["--permuteTandemScan"], "yes no")
984 MiscUtil.ValidateOptionTextValue("-q, --quiet", Options["--quiet"], "yes no")
985
986 MiscUtil.ValidateOptionTextValue(" -u, --uniqueAnalogues", Options["--uniqueAnalogues"], "yes no")
987
988
989 # Setup a usage string for docopt...
990 _docoptUsage_ = """
991 RDKitPerformPositionalAnalogueScan.py - Positional analogue scanning.
992
993 Usage:
994 RDKitPerformPositionalAnalogueScan.py [--comboSearchAtoms <number>] [--clearAtomMapNumbers <yes or no>]
995 [--infileParams <Name,Value,...>] [--mode <ReplaceAtoms, AttachAtoms, AttachSMILES, RxnSMARTS, TandemScan>]
996 [--mp <yes or no>] [--mpParams <Name,Value,...>] [ --nameSuffix <text>] [ --outfileParams <Name,Value,...> ]
997 [--overwrite] [--permuteTandemScan <yes or no>] [--quiet <yes or no>] [--searchPattern <SMARTSPattern>]
998 [--targetPattern <ElementSymbol, SMILES, RxnSMARTS,...>] [--uniqueAnalogues <yes or no> ]
999 [-w <dir>] -i <infile> -o <outfile>
1000 RDKitPerformPositionalAnalogueScan.py -h | --help | -e | --examples
1001
1002 Description:
1003 Perform Positional Analogue Scanning (PAS) to generate analogues of molecules
1004 by applying chemical transformations to molecules [ Ref 147-148 ]. The chemical
1005 transformations are defined using SMARTS, element symbols, SMILES, and
1006 RxnSMARTS. Four different types of chemical transformations are available for
1007 for generating analogues of molecules: replace atoms, attach atoms, attach SMILES,
1008 and RxnSMARTS. Tandem positional analogue scanning may be performed by the
1009 concurrent application of multiple chemical transformations.
1010
1011 A SMARTS search pattern identifies atoms in molecules for attachment or
1012 replacement points during positional analogue scanning. It may retrieve multiple
1013 substructure matches in a molecule. The first matched atom in each substructure
1014 match comprises a set of attachment or replacement points.
1015
1016 A target pattern encompasses information regarding element symbol, SMILES, and
1017 reaction SMARTS for replacing and attaching atoms, attaching SMILES, and applying
1018 reaction SMARTS. In addition, multiple concurrent chemical transformations may
1019 be specified during tandem positional analogue scanning.
1020
1021 The supported input file formats are: Mol (.mol), SD (.sdf, .sd), SMILES (.smi,
1022 .csv, .tsv, .txt)
1023
1024 The supported output file formats are: SD (.sdf, .sd), SMILES (.smi, .csv,
1025 .tsv, .txt)
1026
1027 Options:
1028 -c, --comboSearchAtoms <number> [default: 1]
1029 Number of concurrent search pattern match atoms to use as attachment or
1030 replacement points during positional analogue scanning in 'AttachAtoms',
1031 'AttachSMILES', and 'ReplaceAtoms' modes. This value is ignored during
1032 'RxnSMARTS' and 'TandemScan' values of '-m, --mode' option.
1033 --clearAtomMapNumbers <yes or no> [default: yes]
1034 Clear any atom map numbers in target SMILES specification for 'AttachSMILES'
1035 before generating analogues.
1036 -e, --examples
1037 Print examples.
1038 -h, --help
1039 Print this help message.
1040 -i, --infile <infile>
1041 Input file name.
1042 --infileParams <Name,Value,...> [default: auto]
1043 A comma delimited list of parameter name and value pairs for reading
1044 molecules from files. The supported parameter names for different file
1045 formats, along with their default values, are shown below:
1046
1047 SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes
1048 SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
1049 smilesTitleLine,auto,sanitize,yes
1050
1051 Possible values for smilesDelimiter: space, comma or tab.
1052 -m, --mode <ReplaceAtoms, AttachAtoms,...> [default: ReplaceAtoms]
1053 Type of operations to perform during positional analogue scanning. The
1054 supported values, along with a brief description, are shown below:
1055
1056 Mode Description
1057 ReplaceAtoms Replace atoms with new atoms
1058 AttachAtoms Attach new atoms to atoms
1059 AttachSMILES Attach SMILES to atoms
1060 RxnSMARTS Run reaction SMARTS
1061 TandemScan Perform tandem scan by combining ReplaceAtoms,
1062 AttachAtoms, and AttachSMILES
1063
1064 The chemical transformations of input molecules is dependent on the
1065 values of '-s, --searchPattern' and '-t, --targetPattern' options. For
1066 example, nitrogen-walk or nitrogen scan is performed by '[cH]' and 'N'
1067 values for '-s, --searchPattern' and '-t, --targetPattern' options.
1068 --mp <yes or no> [default: no]
1069 Use multiprocessing.
1070
1071 By default, input data is retrieved in a lazy manner via mp.Pool.imap()
1072 function employing lazy RDKit data iterable. This allows processing of
1073 arbitrary large data sets without any additional requirements memory.
1074
1075 All input data may be optionally loaded into memory by mp.Pool.map()
1076 before starting worker processes in a process pool by setting the value
1077 of 'inputDataMode' to 'InMemory' in '--mpParams' option.
1078
1079 A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
1080 data mode may adversely impact the performance. The '--mpParams' section
1081 provides additional information to tune the value of 'chunkSize'.
1082 --mpParams <Name,Value,...> [default: auto]
1083 A comma delimited list of parameter name and value pairs to configure
1084 multiprocessing.
1085
1086 The supported parameter names along with their default and possible
1087 values are shown below:
1088
1089 chunkSize, auto
1090 inputDataMode, Lazy [ Possible values: InMemory or Lazy ]
1091 numProcesses, auto [ Default: mp.cpu_count() ]
1092
1093 These parameters are used by the following functions to configure and
1094 control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
1095 mp.Pool.imap().
1096
1097 The chunkSize determines chunks of input data passed to each worker
1098 process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
1099 The default value of chunkSize is dependent on the value of 'inputDataMode'.
1100
1101 The mp.Pool.map() function, invoked during 'InMemory' input data mode,
1102 automatically converts RDKit data iterable into a list, loads all data into
1103 memory, and calculates the default chunkSize using the following method
1104 as shown in its code:
1105
1106 chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
1107 if extra: chunkSize += 1
1108
1109 For example, the default chunkSize will be 7 for a pool of 4 worker processes
1110 and 100 data items.
1111
1112 The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
1113 'lazy' RDKit data iterable to retrieve data as needed, without loading all the
1114 data into memory. Consequently, the size of input data is not known a priori.
1115 It's not possible to estimate an optimal value for the chunkSize. The default
1116 chunkSize is set to 1.
1117
1118 The default value for the chunkSize during 'Lazy' data mode may adversely
1119 impact the performance due to the overhead associated with exchanging
1120 small chunks of data. It is generally a good idea to explicitly set chunkSize to
1121 a larger value during 'Lazy' input data mode, based on the size of your input
1122 data and number of processes in the process pool.
1123
1124 The mp.Pool.map() function waits for all worker processes to process all
1125 the data and return the results. The mp.Pool.imap() function, however,
1126 returns the the results obtained from worker processes as soon as the
1127 results become available for specified chunks of data.
1128
1129 The order of data in the results returned by both mp.Pool.map() and
1130 mp.Pool.imap() functions always corresponds to the input data.
1131 -n, --nameSuffix <text> [default: Analogue]
1132 Name suffix for generating molecule names of analogues. Format of analogue
1133 names: <MolName>_<NameSuffix>_<MolNum>
1134 -o, --outfile <outfile>
1135 Output file name.
1136 --outfileParams <Name,Value,...> [default: auto]
1137 A comma delimited list of parameter name and value pairs for writing
1138 molecules to files. The supported parameter names for different file
1139 formats, along with their default values, are shown below:
1140
1141 SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
1142 SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
1143 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no
1144
1145 --overwrite
1146 Overwrite existing files.
1147 -p, --permuteTandemScan <yes or no> [default: yes]
1148 Permute atom positions matched by SMARTS search pattern in a molecule
1149 to generate all possible analogues during tandem positional analogue
1150 scanning.
1151
1152 This option is only valid for 'TandemScan' value of '-m, --mode' option.
1153 -q, --quiet <yes or no> [default: no]
1154 Use quiet mode. The warning and information messages will not be printed.
1155 -s, --searchPattern <SMARTSPattern> [default: auto]
1156 SMARTS search pattern identifying atoms in molecules for attachment or
1157 replacement points during positional analogue scanning. The SMARTS search
1158 pattern may retrieve multiple substructure matches in a molecule. The first
1159 matched atom in each substructure match comprises a set of attachment or
1160 replacement points.
1161
1162 The default values, dependent on the value of '-m, --mode' option, are
1163 shown below:
1164
1165 Mode Default Description
1166 ReplaceAtoms [cH] Aromatic carbon
1167 AttachAtoms [cH] Aromatic carbon
1168 AttachSMILES [cH] Aromatic carbon
1169 RxnSMARTS None Not applicable
1170 TandemScan [cH] Aromatic carbon
1171
1172 This option is ignored during 'RxnSMARTS' value of '-m, --mode' option.
1173 -t, --targetPattern <ElementSymbol, SMILES, RxnSMARTS...> [default: auto]
1174 Target pattern for performing chemical transformations during positional
1175 analogue scanning. These values are used in conjunction with the value of
1176 '-s, --searchPattern' to generate appropriate analogues.
1177
1178 The default values, dependent on the values of '-m, --mode' and
1179 '-s, --searchPattern' options, are shown below:
1180
1181 Mode Default Description
1182 ReplaceAtoms N Element symbol for nitrogen
1183 AttachAtoms F Element symbol for fluorine
1184 AttachSMILES C(F)(F)(F) SMILES for CF3
1185 RxnSMARTS [cH:1]>>[N:1] Replace aromatic carbon by nitrogen
1186 TandemScan ReplaceAtoms,N,AttachAtoms,F Replace and attach
1187
1188 The target specifications may contain multiple space delimted values. For
1189 example:
1190
1191 Mode Example
1192 ReplaceAtoms "N P"
1193 AttachAtoms "F Cl Br"
1194 AttachSMILES "C(F)(F)(F) CO"
1195 RxnSMARTS "[cH:1]>>[N:1] [cH:1]>>[P:1]"
1196 TandemScan "ReplaceAtoms,N P,AttachAtoms,F Cl"
1197
1198 The target specification for 'AttachSMILES' may contain atom map numbers.
1199 The attachment point corresponds to the matched atom with the lowest
1200 atom map number. Otherwise, the first atom in SMILES target specification
1201 represents the attachment point. For example:
1202
1203 Mode Example
1204 AttachSMILES "C(C)(C)(C) CO"
1205 AttachSMILES "C([C:1])(C)(C) C(F)(F)(F)"
1206
1207 Multiple concurrent chemical transformations are allowed during 'TandemScan'. The
1208 target pattern specification for 'TandemScan' is a comma delimited list of operation
1209 type and target pattern. Format: OperationType,TargetPattern,...
1210
1211 The supported operation types and target pattern for 'TandemScan' are shown
1212 below:
1213
1214 ReplaceAtoms,<ElementSymbol>
1215 AttachAtoms,<ElementSymbol>
1216 AttachSMILES,<SMILES>
1217
1218 ReplaceAtoms,"<ElementSymbol> <ElementSymbol>..."
1219 AttachAtoms,"<ElementSymbol> <ElementSymbol>..."
1220 AttachSMILES,"<SMILES> <SMIELS>..."
1221
1222 For example:
1223
1224 ReplaceAtoms,N,AttachAtoms,F
1225 ReplaceAtoms,N,AttachAtoms,F,AttachSMILES,C(F)(F)(F)
1226
1227 "ReplaceAtoms,N,AttachAtoms,F Cl"
1228 "ReplaceAtoms,N P,AttachAtoms,F Cl"
1229 "ReplaceAtoms,N P,AttachAtoms,F Cl,AttachSMILES,C(F)(F)(F) CO"
1230
1231 The number of chemical transformations in 'TandemScan' must be less than or
1232 equal to the total number atoms matched by SMARTS search pattern in a molecule.
1233 Otherwise, it is not possible to perform a 'TandemScan'. The matched atom positions
1234 may be optionally permuted to generate all possible analogues during positional
1235 analogue scanning using '-p, --permuteTandemScan' option.
1236 -u, --uniqueAnalogues <yes or no> [default: yes]
1237 Keep only unique analogues of a molecule corresponding to unique SMILES
1238 strings. The duplicate SMILES string may be generated during PAS due to
1239 symmetric replacement or attachment points in molecules.
1240 -w, --workingdir <dir>
1241 Location of working directory which defaults to the current directory.
1242
1243 Examples:
1244 To perform a nitrogen-walk or nitrogen scan by replacing aromatic carbons by
1245 nitrogens in molecules in a SMILES file and write out a SMILES file, type:
1246
1247 % RDKitPerformPositionalAnalogueScan.py -i Sample.smi -o SampleOut.smi
1248
1249 To run the first example by explicitly specifying search and target patterns
1250 for replacing aromatic carbons by nitogens in molecules in a SD file and write
1251 out a SD file, type:
1252
1253 % RDKitPerformPositionalAnalogueScan.py -m ReplaceAtoms -s "[cH]"
1254 -t "N" -i Sample.sdf -o SampleOut.sdf
1255
1256 To run the previous example for replacing aromatic carbons by N and P, type:
1257
1258 % RDKitPerformPositionalAnalogueScan.py -m ReplaceAtoms -s "[cH]"
1259 -t "N P" -i Sample.sdf -o SampleOut.sdf
1260
1261 To run the first example in multiprocessing mode on all available CPUs
1262 without loading all data into memory and write out a SD file, type:
1263
1264 % RDKitPerformPositionalAnalogueScan.py -i Sample.smi -o SampleOut.sdf
1265 --mp yes
1266
1267 To run the previous example in multiprocessing mode on all available CPUs
1268 by loading all data into memory and write out a SD file, type:
1269
1270 % RDKitPerformPositionalAnalogueScan.py -i Sample.smi -o SampleOut.sdf
1271 --mp yes --mpParams "inputDataMode,InMemory"
1272
1273 To run the previous example in multiprocessing mode on specific number of
1274 CPUs and chunk size without loading all data into memory and write out a SD file,
1275 type:
1276
1277 % RDKitPerformPositionalAnalogueScan.py -i Sample.smi -o SampleOut.sdf
1278 --mpParams "inputDataMode,Lazy,numProcesses,4,chunkSize,8"
1279
1280 To perform positional analogue scanning by simultaneously attaching fluorines
1281 to two aromatic carbons in molecules in a SMILES file and write out a SD file,
1282 type:
1283
1284 % RDKitPerformPositionalAnalogueScan.py -m AttachAtoms -s "[cH]"
1285 -t "F" -c 2 -i Sample.smi -o SampleOut.sdf
1286
1287 To perform positional analogue scanning by attaching SMILES to aromatic
1288 carbons in molecules in a SMILES file and write out a SD file, type:
1289
1290 % RDKitPerformPositionalAnalogueScan.py -m AttachSMILES -s "[cH]"
1291 -t "F" -i Sample.smi -o SampleOut.sdf
1292
1293 To run the previous example for attaching F and CF3 to aromatic carbons,
1294 type:
1295
1296 % RDKitPerformPositionalAnalogueScan.py -m AttachSMILES -s "[cH]"
1297 -t "F C(F)(F)(F)" -i Sample.smi -o SampleOut.sdf
1298
1299 To run the previous example for attaching "C(C)(C)(C)" at a specific attachment
1300 point corresponding to the lowest atom map number, type:
1301
1302 % RDKitPerformPositionalAnalogueScan.py -m AttachSMILES -s "[cH]"
1303 -t "C([C:1])([C:2])(C)" -i Sample.smi -o SampleOut.sdf
1304
1305 To perform a nitrogen-walk or nitrogen scan by using reaction SMARTS to replace
1306 aromatic carbons by nitrogens in molecules in a SMILES file and write out a SMILES
1307 file, type:
1308
1309 % RDKitPerformPositionalAnalogueScan.py -m RxnSMARTS
1310 -t "[cH:1]>>[N:1]" -i Sample.smi -o SampleOut.smi
1311
1312 To perform a tandem positional analogue scan by concurrently applying multiple
1313 chemical transformations to aromatic carbons, permute all matched search
1314 atom positions during analogue generation, and write out a SD file, type:
1315
1316 % RDKitPerformPositionalAnalogueScan.py -m TandemScan -s "[cH]"
1317 -t "ReplaceAtoms,N,AttachAtoms,F,AttachSMILES,OC"
1318 -p yes -i Sample.smi -o SampleOut.smi
1319
1320 To run the previous example for attaching multiple atoms and SMILES to
1321 aromatic carbons, type:
1322
1323 % RDKitPerformPositionalAnalogueScan.py -m TandemScan -s "[cH]"
1324 -t "ReplaceAtoms,N,AttachAtoms,F Cl,AttachSMILES,C(F)(F)(F) OC"
1325 -p yes -i Sample.smi -o SampleOut.smi
1326
1327 To perform a nitrogen-walk or nitrogen scan by replacing aromatic carbons by
1328 nitrogens in molecules in a SMILES CSV fileS, MILES strings in column 1, name
1329 in column 2, and write out a SD file, type:
1330
1331 % RDKitPerformPositionalAnalogueScan.py -m ReplaceAtoms -s "[cH]"
1332 -t "N" --infileParams "smilesDelimiter,comma, smilesTitleLine,yes,
1333 smilesColumn,1,smilesNameColumn,2"
1334 -i SampleSMILES.csv -o SampleOut.sdf
1335
1336 Author:
1337 Manish Sud(msud@san.rr.com)
1338
1339 See also:
1340 RDKitConvertFileFormat.py, RDKitEnumerateCompoundLibrary.py,
1341 RDKitPerformTorsionScan.py
1342
1343 Copyright:
1344 Copyright (C) 2026 Manish Sud. All rights reserved.
1345
1346 The functionality available in this script is implemented using RDKit, an
1347 open source toolkit for cheminformatics developed by Greg Landrum.
1348
1349 This file is part of MayaChemTools.
1350
1351 MayaChemTools is free software; you can redistribute it and/or modify it under
1352 the terms of the GNU Lesser General Public License as published by the Free
1353 Software Foundation; either version 3 of the License, or (at your option) any
1354 later version.
1355
1356 """
1357
1358 if __name__ == "__main__":
1359 main()