#!/usr/bin/perl -w
#
# File: TopologicalAtomTripletsFingerprints.pl
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2024 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin;
use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;
use SDFileUtil;
use MoleculeFileIO;
use FileIO::FingerprintsSDFileIO;
use FileIO::FingerprintsTextFileIO;
use FileIO::FingerprintsFPFileIO;
use AtomTypes::AtomicInvariantsAtomTypes;
use AtomTypes::FunctionalClassAtomTypes;
use Fingerprints::TopologicalAtomTripletsFingerprints;

# File-scoped state shared by the main script and the subroutines below:
# script name, raw command line options, and benchmark timing objects.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Announce the start of the run and record the starting time...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Expand command line arguments into the list of SD files to process...
my(@SDFilesList) = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Process input files, skipping any that failed the checks above...
my($FileIndex);
print "\nProcessing SD files...\n" if (@SDFilesList > 1);

FILE: for $FileIndex (0 .. $#SDFilesList) {
  next FILE unless $SDFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  GenerateTopologicalAtomTripletsFingerprints($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Generate fingerprints for a SD file...
#
# Reads every molecule from the SD file at position $FileIndex in @SDFilesList,
# optionally filters it, generates its fingerprints, writes them to the opened
# output files and finally prints summary statistics.
#
sub GenerateTopologicalAtomTripletsFingerprints {
  my($FileIndex) = @_;

  my $InputSDFile = $SDFilesList[$FileIndex];

  # Setup output files...
  #
  my($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = SetupAndOpenOutputFiles($FileIndex);

  my $Reader = MoleculeFileIO->new('Name' => $InputSDFile);
  $Reader->Open();

  my $TotalCount = 0;
  my $SkippedCount = 0;

  COMPOUND: while (my $Molecule = $Reader->ReadMolecule()) {
    $TotalCount++;

    # Filter compound data before calculating fingerprints...
    if ($OptionsInfo{Filter} && CheckAndFilterCompound($TotalCount, $Molecule)) {
      $SkippedCount++;
      next COMPOUND;
    }

    my $Fingerprints = GenerateMoleculeFingerprints($Molecule);
    unless ($Fingerprints) {
      $SkippedCount++;
      ProcessIgnoredCompound('FingerprintsGenerationFailed', $TotalCount, $Molecule);
      next COMPOUND;
    }

    WriteDataToOutputFiles($FileIndex, $TotalCount, $Molecule, $Fingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
  }
  $Reader->Close();

  # Close whichever output writers were opened (SD, text, FP in that order)...
  for my $Writer ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) {
    $Writer->Close() if $Writer;
  }

  WriteFingerprintsGenerationSummaryStatistics($TotalCount, $SkippedCount);
}

# Process compound being ignored due to problems in fingerprints generation...
#
# Emits a warning naming the record number and compound ID. $Mode selects the
# explanation; any unrecognized mode gets the generic "didn't succeed" text.
#
sub ProcessIgnoredCompound {
  my($Mode, $CmpdCount, $Molecule) = @_;

  my $DataFieldsRef = $Molecule->GetDataFieldLabelAndValues();
  my $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldsRef);

  my $Prefix = "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID";

  if ($Mode =~ /^ContainsNonElementalData$/i) {
    warn "$Prefix: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
  }
  elsif ($Mode =~ /^ContainsNoElementalData$/i) {
    warn "$Prefix: Compound contains no atom data...\n\n";
  }
  else {
    # Covers FingerprintsGenerationFailed as well as any other mode value.
    warn "$Prefix: Fingerprints generation didn't succeed...\n\n";
  }
}

# Check and filter compounds....
#
# Returns 1 (after issuing a warning) when the compound contains non-elemental
# atom data or no elemental atom data at all; returns 0 otherwise.
#
sub CheckAndFilterCompound {
  my($CmpdCount, $Molecule) = @_;

  my($ElementCount, $NonElementCount) = $Molecule->GetNumOfElementsAndNonElements();

  my $Reason = $NonElementCount ? 'ContainsNonElementalData'
             : !$ElementCount   ? 'ContainsNoElementalData'
             :                    '';
  return 0 unless $Reason;

  ProcessIgnoredCompound($Reason, $CmpdCount, $Molecule);
  return 1;
}

# Write out compounds fingerprints generation summary statistics...
#
sub WriteFingerprintsGenerationSummaryStatistics {
  my($CmpdCount, $IgnoredCmpdCount) = @_;

  my $SucceededCount = $CmpdCount - $IgnoredCmpdCount;

  print "\nNumber of compounds: $CmpdCount\n",
    "Number of compounds processed successfully during fingerprints generation: $SucceededCount\n",
    "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
}

# Open output files...
#
# Creates and opens a writer for each output type enabled in %OptionsInfo.
# Returns the list (SD writer, text writer, FP writer); entries for disabled
# output types are undef.
#
sub SetupAndOpenOutputFiles {
  my($FileIndex) = @_;
  my($SDWriter, $TextWriter, $FPWriter);

  # Setup common parameters for fingerprints file IO objects...
  #
  my %CommonParams = (
    'Mode' => 'Write',
    'Overwrite' => $OptionsInfo{OverwriteFiles},
    'FingerprintsStringMode' => 'FingerprintsVectorString',
    'VectorStringFormat' => $OptionsInfo{VectorStringFormat},
  );

  if ($OptionsInfo{SDOutput}) {
    my $SDOutFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
    print "Generating SD file $SDOutFile...\n";
    $SDWriter = FileIO::FingerprintsSDFileIO->new('Name' => $SDOutFile, %CommonParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
    $SDWriter->Open();
  }

  if ($OptionsInfo{FPOutput}) {
    my $FPOutFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
    print "Generating FP file $FPOutFile...\n";
    $FPWriter = FileIO::FingerprintsFPFileIO->new('Name' => $FPOutFile, %CommonParams);
    $FPWriter->Open();
  }

  if ($OptionsInfo{TextOutput}) {
    my $TextOutFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
    my $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);

    print "Generating text file $TextOutFile...\n";
    $TextWriter = FileIO::FingerprintsTextFileIO->new('Name' => $TextOutFile, %CommonParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
    $TextWriter->Open();
  }

  return ($SDWriter, $TextWriter, $FPWriter);
}

# Write fingerprints and other data to appropriate output files...
#
# Each writer argument may be undef, in which case that output is skipped.
# Molecule data fields are fetched only when the text or FP writer needs them.
#
sub WriteDataToOutputFiles {
  my($FileIndex, $CmpdCount, $Molecule, $TopologicalAtomTripletsFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;

  my $DataFieldsRef = ($NewFPTextFileIO || $NewFPFileIO) ? $Molecule->GetDataFieldLabelAndValues() : undef;

  if ($NewFPSDFileIO) {
    my $InputMoleculeString = $Molecule->GetInputMoleculeString();
    $NewFPSDFileIO->WriteFingerprints($TopologicalAtomTripletsFingerprints, $InputMoleculeString);
  }

  if ($NewFPTextFileIO) {
    my $RowValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $DataFieldsRef);
    $NewFPTextFileIO->WriteFingerprints($TopologicalAtomTripletsFingerprints, $RowValuesRef);
  }

  if ($NewFPFileIO) {
    my $ID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldsRef);
    $NewFPFileIO->WriteFingerprints($TopologicalAtomTripletsFingerprints, $ID);
  }
}

# Generate appropriate column labels for FPText output file...
#
# Returns a reference to the list of data column labels selected by the
# DataFieldsMode option, followed by the fingerprints label.
#
sub SetupFPTextFileCoulmnLabels {
  my($FileIndex) = @_;

  my $Mode = $OptionsInfo{DataFieldsMode};
  my @DataLabels = ();

  if ($Mode =~ /^All$/i) {
    @DataLabels = @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
  }
  elsif ($Mode =~ /^Common$/i) {
    @DataLabels = @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
  }
  elsif ($Mode =~ /^Specify$/i) {
    @DataLabels = @{$OptionsInfo{SpecifiedDataFields}};
  }
  elsif ($Mode =~ /^CompoundID$/i) {
    @DataLabels = ($OptionsInfo{CompoundIDLabel});
  }

  # Fingerprints label always comes last...
  return [ @DataLabels, $OptionsInfo{FingerprintsLabel} ];
}

# Generate column values FPText output file..
#
# Returns a reference to the list of data column values for one compound, as
# selected by the DataFieldsMode option. Data fields missing from the compound
# are written as empty strings.
#
sub SetupFPTextFileCoulmnValues {
  my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;

  my $Mode = $OptionsInfo{DataFieldsMode};

  if ($Mode =~ /^CompoundID$/i) {
    return [ SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) ];
  }

  # Pick the data field labels whose values make up the row...
  my @FieldLabels = ();
  if ($Mode =~ /^All$/i) {
    @FieldLabels = @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
  }
  elsif ($Mode =~ /^Common$/i) {
    @FieldLabels = @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
  }
  elsif ($Mode =~ /^Specify$/i) {
    @FieldLabels = @{$OptionsInfo{SpecifiedDataFields}};
  }

  my @RowValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : '' } @FieldLabels;

  return \@RowValues;
}

# Generate compound ID for FP and FPText output files..
#
# The ID is built according to the CompoundIDMode option: molecule name, label
# prefix plus record number, a data field value, or molecule name with a
# fallback to the label prefix. Returns an empty string for any other mode.
#
sub SetupCmpdIDForOutputFiles {
  my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;

  my $IDMode = $OptionsInfo{CompoundIDMode};

  if ($IDMode =~ /^MolNameOrLabelPrefix$/i) {
    my $Name = $Molecule->GetName();
    return $Name if $Name;
    return "$OptionsInfo{CompoundID}${CmpdCount}";
  }

  if ($IDMode =~ /^LabelPrefix$/i) {
    return "$OptionsInfo{CompoundID}${CmpdCount}";
  }

  if ($IDMode =~ /^DataField$/i) {
    my $FieldLabel = $OptionsInfo{CompoundID};
    return exists $DataFieldLabelAndValuesRef->{$FieldLabel} ? $DataFieldLabelAndValuesRef->{$FieldLabel} : '';
  }

  if ($IDMode =~ /^MolName$/i) {
    my $Name = $Molecule->GetName();
    return $Name;
  }

  return '';
}

# Generate fingerprints for molecule...
#
# Prepares the molecule (largest component, rings, aromaticity) and generates
# topological atom triplets fingerprints for it. Returns the fingerprints
# object, or undef when ring detection or fingerprints generation fails.
#
sub GenerateMoleculeFingerprints {
  my($Molecule) = @_;

  $Molecule->KeepLargestComponent() if $OptionsInfo{KeepLargestComponent};

  return undef unless $Molecule->DetectRings();

  $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
  $Molecule->DetectAromaticity();

  my $Fingerprints = Fingerprints::TopologicalAtomTripletsFingerprints->new(
    'Molecule' => $Molecule,
    'MinDistance' => $OptionsInfo{MinDistance},
    'MaxDistance' => $OptionsInfo{MaxDistance},
    'UseTriangleInequality' => $OptionsInfo{UseTriangleInequality},
    'AtomIdentifierType' => $OptionsInfo{AtomIdentifierType},
  );
  SetAtomIdentifierTypeValuesToUse($Fingerprints);

  # Generate fingerprints...
  $Fingerprints->GenerateFingerprints();

  # Make sure fingerprints generation is successful...
  return $Fingerprints->IsFingerprintsGenerationSuccessful() ? $Fingerprints : undef;
}

# Set atom identifier type to use for generating fingerprints...
#
# Passes the atomic invariants or functional classes collected in %OptionsInfo
# to the fingerprints object; the other supported atom identifier types need no
# extra setup. Dies on an unsupported atom identifier type.
#
sub SetAtomIdentifierTypeValuesToUse {
  my($TopologicalAtomTripletsFingerprints) = @_;

  if ($OptionsInfo{AtomIdentifierType} =~ /^AtomicInvariantsAtomTypes$/i) {
    $TopologicalAtomTripletsFingerprints->SetAtomicInvariantsToUse(\@{$OptionsInfo{AtomicInvariantsToUse}});
  }
  elsif ($OptionsInfo{AtomIdentifierType} =~ /^FunctionalClassAtomTypes$/i) {
    $TopologicalAtomTripletsFingerprints->SetFunctionalClassesToUse(\@{$OptionsInfo{FunctionalClassesToUse}});
  }
  elsif ($OptionsInfo{AtomIdentifierType} =~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
    # Nothing to do for now...
  }
  else {
    die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
  }
}

# Retrieve information about SD files...
#
# Fills %SDFilesInfo with one entry per input file in @SDFilesList: whether the
# file passed all checks (FileOkay), the output file root and output file
# names, and references to the all/common data field labels when those are
# needed for text output. Files failing a check are reported with a warning
# and left with FileOkay set to 0.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFileRoot}} = ();
  @{$SDFilesInfo{SDOutFileNames}} = ();
  @{$SDFilesInfo{FPOutFileNames}} = ();
  @{$SDFilesInfo{TextOutFileNames}} = ();
  @{$SDFilesInfo{AllDataFieldsRef}} = ();
  @{$SDFilesInfo{CommonDataFieldsRef}} = ();

  # The compound ID data field only needs to be verified when it ends up as a text output column...
  $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;

  # Data field labels are only collected when they define the text output columns...
  $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFileRoot}[$Index] = '';
    $SDFilesInfo{SDOutFileNames}[$Index] = '';
    $SDFilesInfo{FPOutFileNames}[$Index] = '';
    $SDFilesInfo{TextOutFileNames}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    if ($CheckDataField) {
      # Make sure data field exists in SD file, based on its first compound record..
      my($CmpdString, $SpecifiedDataField, $SDFileHandle, @CmpdLines, %DataFieldValues);

      @CmpdLines = ();
      # Three-arg open with a lexical handle: the file name can never be
      # interpreted as an open mode and the handle doesn't leak globally.
      open $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      $CmpdString = ReadCmpdString($SDFileHandle);
      close $SDFileHandle;
      @CmpdLines = split "\n", $CmpdString;
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      $SpecifiedDataField = $OptionsInfo{CompoundID};
      if (!exists $DataFieldValues{$SpecifiedDataField}) {
        warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
        next FILELIST;
      }
    }

    $AllDataFieldsRef = '';
    $CommonDataFieldsRef = '';
    if ($CollectDataFields) {
      my($CmpdCount, $SDFileHandle);
      open $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileHandle);
      close $SDFileHandle;
    }

    # Setup output file names...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    $TextOutFileExt = "csv";
    if ($OptionsInfo{OutDelim} =~ /^tab$/i) {
      $TextOutFileExt = "tsv";
    }
    $SDOutFileExt = $FileExt;
    $FPOutFileExt = "fpf";

    # A user specified root is only honored for a single input file...
    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = "${FileName}TopologicalAtomTripletsFP";
    }

    $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
    $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
    $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

    if ($OptionsInfo{SDOutput}) {
      # Compare literally (\Q...\E) against the whole input name or its final
      # path component; an unquoted, unanchored pattern would treat "." as a
      # wildcard and reject inputs that merely contain the output name.
      if ($SDFile =~ /(?:\A|[\/\\])\Q$NewSDFileName\E\z/i) {
        warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
        print "Specify a different name using \"-r --root\" option or use default name.\n";
        next FILELIST;
      }
    }

    if (!$OptionsInfo{OverwriteFiles}) {
      # Check SD, FP and text output files...
      if ($OptionsInfo{SDOutput}) {
        if (-e $NewSDFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{FPOutput}) {
        if (-e $NewFPFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{TextOutput}) {
        if (-e $NewTextFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
          next FILELIST;
        }
      }
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;

    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
    $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
    $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

    $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
    $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
  }
}

# Process option values...
#
# Translates the raw command line values in %Options into %OptionsInfo, turning
# Yes/No values into 1/0 flags and applying defaults. Dies when a value
# required by the selected mode is missing.
#
sub ProcessOptions {
  %OptionsInfo = ();

  ProcessAtomIdentifierTypeOptions();

  $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel};

  $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
  $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
  $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};

  my(@SpecifiedDataFields);
  @SpecifiedDataFields = ();

  @{$OptionsInfo{SpecifiedDataFields}} = ();
  $OptionsInfo{CompoundID} = '';

  # NOTE(review): CompoundID is only captured when DataFieldsMode is CompoundID,
  # yet SetupCmpdIDForOutputFiles also uses it for FP output and for warning
  # messages in the other data fields modes - confirm this is intended.
  if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
    if ($Options{compoundidmode} =~ /^DataField$/i) {
      if (!$Options{compoundid}) {
        die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
      }
      $OptionsInfo{CompoundID} = $Options{compoundid};
    }
    elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
      $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
    }
  }
  elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
    if (!$Options{datafields}) {
      die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
    }
    @SpecifiedDataFields = split /\,/, $Options{datafields};
    push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields;
  }

  $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'TopologicalAtomTripletsFingerprints';

  $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{MinDistance} = $Options{mindistance};
  $OptionsInfo{MaxDistance} = $Options{maxdistance};

  $OptionsInfo{Output} = $Options{output};
  $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
  $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
  $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;

  $OptionsInfo{OutDelim} = $Options{outdelim};
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;

  $OptionsInfo{UseTriangleInequality} = ($Options{usetriangleinequality} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{VectorStringFormat} = $Options{vectorstringformat};
}

# Process atom identifier type and related options...
#
# Records the atom identifier type in %OptionsInfo and processes the option
# that accompanies it, if any. Dies on an unsupported atom identifier type.
#
sub ProcessAtomIdentifierTypeOptions {
  my $Type = $Options{atomidentifiertype};

  $OptionsInfo{AtomIdentifierType} = $Type;

  if ($Type =~ /^AtomicInvariantsAtomTypes$/i) {
    ProcessAtomicInvariantsToUseOption();
    return;
  }

  if ($Type =~ /^FunctionalClassAtomTypes$/i) {
    ProcessFunctionalClassesToUse();
    return;
  }

  # Remaining supported types need no additional options for now...
  return if ($Type =~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i);

  die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
}

# Process specified atomic invariants to use...
#
# Splits the comma delimited "--AtomicInvariantsToUse" value, validates each
# atomic invariant and stores the list in %OptionsInfo. Dies when the value is
# empty, when an invariant is not available, or when neither AS nor AtomSymbol
# is part of the list.
#
sub ProcessAtomicInvariantsToUseOption {
  my($AtomicInvariant, $AtomSymbolSpecified, @AtomicInvariantsWords);

  @{$OptionsInfo{AtomicInvariantsToUse}} = ();
  if (IsEmpty($Options{atomicinvariantstouse})) {
    die "Error: Atomic invariants value specified using \"--AtomicInvariantsToUse\" option is empty\n";
  }
  $AtomSymbolSpecified = 0;
  @AtomicInvariantsWords = split /\,/, $Options{atomicinvariantstouse};
  for $AtomicInvariant (@AtomicInvariantsWords) {
    if (!AtomTypes::AtomicInvariantsAtomTypes::IsAtomicInvariantAvailable($AtomicInvariant)) {
      # Message must end in a newline; otherwise die appends the script line number.
      die "Error: Atomic invariant specified, $AtomicInvariant, using \"--AtomicInvariantsToUse\" option is not valid...\n";
    }
    if ($AtomicInvariant =~ /^(AS|AtomSymbol)$/i) {
      $AtomSymbolSpecified = 1;
    }
    push @{$OptionsInfo{AtomicInvariantsToUse}}, $AtomicInvariant;
  }
  if (!$AtomSymbolSpecified) {
    die "Error: Atomic invariant, AS or AtomSymbol, must be specified as using \"--AtomicInvariantsToUse\" option...\n";
  }
}

# Process specified functional classes invariants to use...
#
# Splits the comma delimited "--FunctionalClassesToUse" value, validates each
# functional class and stores the list in %OptionsInfo. Dies when the value is
# empty or a functional class is not available.
#
sub ProcessFunctionalClassesToUse {
  my($FunctionalClass, @FunctionalClassesToUseWords);

  @{$OptionsInfo{FunctionalClassesToUse}} = ();
  if (IsEmpty($Options{functionalclassestouse})) {
    die "Error: Functional classes value specified using \"--FunctionalClassesToUse\" option is empty\n";
  }
  @FunctionalClassesToUseWords = split /\,/, $Options{functionalclassestouse};
  for $FunctionalClass (@FunctionalClassesToUseWords) {
    if (!AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($FunctionalClass)) {
      # Message must end in a newline; otherwise die appends the script line number.
      die "Error: Functional class specified, $FunctionalClass, using \"--FunctionalClassesToUse\" option is not valid...\n";
    }
    push @{$OptionsInfo{FunctionalClassesToUse}}, $FunctionalClass;
  }
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Seeds %Options with default values, overlays the values given on the command
# line, changes to the working directory when one is specified, and validates
# option values. Dies with an explanatory message on the first invalid value.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  $Options{aromaticitymodel} = 'MayaChemToolsAromaticityModel';

  $Options{atomidentifiertype} = 'AtomicInvariantsAtomTypes';
  $Options{atomicinvariantstouse} = 'AS,X,BO,H,FC';

  $Options{functionalclassestouse} = 'HBD,HBA,PI,NI,Ar,Hal';

  $Options{compoundidmode} = 'LabelPrefix';
  $Options{compoundidlabel} = 'CompoundID';
  $Options{datafieldsmode} = 'CompoundID';

  $Options{filter} = 'Yes';

  $Options{keeplargestcomponent} = 'Yes';

  $Options{mindistance} = 1;
  $Options{maxdistance} = 10;

  $Options{output} = 'text';
  $Options{outdelim} = 'comma';
  $Options{quote} = 'yes';

  $Options{usetriangleinequality} = 'No';

  $Options{vectorstringformat} = 'IDsAndValuesString';

  if (!GetOptions(\%Options,
      "aromaticitymodel=s",
      "atomidentifiertype|a=s",
      "atomicinvariantstouse=s",
      "functionalclassestouse=s",
      "compoundid=s",
      "compoundidlabel=s",
      "compoundidmode=s",
      "datafields=s",
      "datafieldsmode|d=s",
      "filter|f=s",
      "fingerprintslabel=s",
      "help|h",
      "keeplargestcomponent|k=s",
      "mindistance=s",
      "maxdistance=s",
      "outdelim=s",
      "output=s",
      "overwrite|o",
      "quote|q=s",
      "root|r=s",
      "usetriangleinequality|u=s",
      "vectorstringformat|v=s",
      "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }

  # Switch to the working directory before any further file handling...
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  if (!Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
    my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
    die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
  }
  if ($Options{atomidentifiertype} !~ /^(AtomicInvariantsAtomTypes|DREIDINGAtomTypes|EStateAtomTypes|FunctionalClassAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
    die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
  }
  if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
    die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
  }
  if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
    die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
  }
  if ($Options{filter} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
  }
  if (!IsPositiveInteger($Options{mindistance})) {
    die "Error: The value specified, $Options{mindistance}, for option \"--MinDistance\" is not valid. Allowed values: > 0 \n";
  }
  if (!IsPositiveInteger($Options{maxdistance})) {
    die "Error: The value specified, $Options{maxdistance}, for option \"--MaxDistance\" is not valid. Allowed values: > 0 \n";
  }
  # Equal minimum and maximum distances are accepted; only min > max is an error.
  if ($Options{mindistance} > $Options{maxdistance}) {
    die "Error: The value specified, $Options{mindistance}, for option \"--MinDistance\" must be less than or equal to the value specified, $Options{maxdistance}, for option \"--MaxDistance\" \n";
  }
  if ($Options{output} !~ /^(SD|FP|text|all)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
  }
  if ($Options{usetriangleinequality} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{usetriangleinequality}, for option \"-u, --UseTriangleInequality\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{vectorstringformat} !~ /^(IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
    die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
  }
}