#!/usr/bin/perl -w
#
# File: TopologicalPharmacophoreAtomPairsFingerprints.pl
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2024 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;
use SDFileUtil;
use MoleculeFileIO;
use FileIO::FingerprintsSDFileIO;
use FileIO::FingerprintsTextFileIO;
use FileIO::FingerprintsFPFileIO;
use AtomTypes::FunctionalClassAtomTypes;
use Fingerprints::TopologicalPharmacophoreAtomPairsFingerprints;

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Turn on autoflush for STDOUT so progress messages show up immediately...
$| = 1;

# Announce the start of the run and start the clock...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Parse the command line; show usage when help is requested or no files are given...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || !@ARGV);

my(@SDFilesList) = ExpandFileNames(\@ARGV, "sdf sd");

# Convert raw command line options into %OptionsInfo...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Validate input files and work out output file names...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Generate fingerprints for each input file that passed validation...
my($FileIndex);
print "\nProcessing SD files...\n" if (@SDFilesList > 1);

for $FileIndex (0 .. $#SDFilesList) {
  next unless $SDFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  GenerateTopologicalPharmacophoreAtomPairsFingerprints($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Generate fingerprints for a SD file...
#
# Reads every molecule from the SD file at position $FileIndex in @SDFilesList,
# optionally filters it, generates its fingerprints and writes them to the
# output files. Output files are opened lazily, right after the first molecule
# for which fingerprints generation succeeds. A summary of processed and
# ignored compound counts is printed at the end.
#
sub GenerateTopologicalPharmacophoreAtomPairsFingerprints {
  my($FileIndex) = @_;
  my($SDFile, $Reader, $Molecule, $Fingerprints);
  my($SDWriter, $TextWriter, $FPWriter);
  my($NumRead, $NumIgnored) = (0, 0);
  my($OutputPending) = 1;

  $SDFile = $SDFilesList[$FileIndex];

  $Reader = MoleculeFileIO->new('Name' => $SDFile);
  $Reader->Open();

  COMPOUND: while ($Molecule = $Reader->ReadMolecule()) {
    $NumRead++;

    # Drop compounds rejected by the filter before doing any real work...
    if ($OptionsInfo{Filter} && CheckAndFilterCompound($NumRead, $Molecule)) {
      $NumIgnored++;
      next COMPOUND;
    }

    $Fingerprints = GenerateMoleculeFingerprints($Molecule);
    unless ($Fingerprints) {
      $NumIgnored++;
      ProcessIgnoredCompound('FingerprintsGenerationFailed', $NumRead, $Molecule);
      next COMPOUND;
    }

    # First successful molecule: finalize the label and open output files...
    if ($OutputPending) {
      $OutputPending = 0;
      SetupFingerprintsLabelValueIDs($Fingerprints);
      ($SDWriter, $TextWriter, $FPWriter) = SetupAndOpenOutputFiles($FileIndex);
    }

    WriteDataToOutputFiles($FileIndex, $NumRead, $Molecule, $Fingerprints, $SDWriter, $TextWriter, $FPWriter);
  }
  $Reader->Close();

  # Close whichever writers were actually opened...
  for my $Writer ($SDWriter, $TextWriter, $FPWriter) {
    $Writer->Close() if $Writer;
  }

  WriteFingerprintsGenerationSummaryStatistics($NumRead, $NumIgnored);
}

# Process compound being ignored due to problems in fingerprints generation...
#
# Emits a warning explaining why the compound record $CmpdCount is skipped.
# $Mode selects the explanation: ContainsNonElementalData,
# ContainsNoElementalData or FingerprintsGenerationFailed (case-insensitive).
# Any other mode gets the same text as FingerprintsGenerationFailed.
#
sub ProcessIgnoredCompound {
  my($Mode, $CmpdCount, $Molecule) = @_;
  my($ID, $DataRef, $Message);

  $DataRef = $Molecule->GetDataFieldLabelAndValues();
  $ID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataRef);

  if ($Mode =~ /^ContainsNonElementalData$/i) {
    $Message = "\nWarning: Ignoring compound record number $CmpdCount with ID $ID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
  }
  elsif ($Mode =~ /^ContainsNoElementalData$/i) {
    $Message = "\nWarning: Ignoring compound record number $CmpdCount with ID $ID: Compound contains no atom data...\n\n";
  }
  else {
    # Covers FingerprintsGenerationFailed as well as any unrecognized mode...
    $Message = "\nWarning: Ignoring compound record number $CmpdCount with ID $ID: Fingerprints generation didn't succeed...\n\n";
  }

  warn $Message;
}

# Check and filter compounds....
#
# Returns 1 when the compound must be skipped, after reporting it through
# ProcessIgnoredCompound, and 0 otherwise. A compound is skipped when it has
# any non-elemental atom data or no elemental atom data at all.
#
sub CheckAndFilterCompound {
  my($CmpdCount, $Molecule) = @_;
  my($NumElements, $NumNonElements) = $Molecule->GetNumOfElementsAndNonElements();
  my($Reason) = '';

  if ($NumNonElements) {
    $Reason = 'ContainsNonElementalData';
  }
  elsif (!$NumElements) {
    $Reason = 'ContainsNoElementalData';
  }

  return 0 unless $Reason;

  ProcessIgnoredCompound($Reason, $CmpdCount, $Molecule);
  return 1;
}

# Write out compounds fingerprints generation summary statistics...
#
# Prints the total, successfully processed and ignored compound counts for
# one input file.
#
sub WriteFingerprintsGenerationSummaryStatistics {
  my($CmpdCount, $IgnoredCmpdCount) = @_;
  my($ProcessedCmpdCount);

  $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;

  print "\nNumber of compounds: $CmpdCount\n";
  print "Number of compounds processed successfully during fingerprints generation: $ProcessedCmpdCount\n";
  print "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
}

# Append atom pair value IDs to fingerprint label...
#
# Does nothing for ArbitrarySize atom pairs sets or when the label mode is not
# FingerprintsLabelWithIDs. Otherwise $OptionsInfo{FingerprintsLabel} becomes
# "<base label>; Value IDs: <IDs>".
#
# This sub is invoked once per input file. The base label is remembered on the
# first call and the full label is rebuilt from it on every call, so that the
# "; Value IDs: ..." suffix is not appended again for each additional file.
#
sub SetupFingerprintsLabelValueIDs {
  my($TopologicalPharmacophoreAtomPairsFingerprints) = @_;

  if ($OptionsInfo{AtomPairsSetSizeToUse} =~ /^ArbitrarySize$/i ||
    $OptionsInfo{FingerprintsLabelMode} !~ /^FingerprintsLabelWithIDs$/i) {
    return;
  }

  if (!exists $OptionsInfo{FingerprintsBaseLabel}) {
    $OptionsInfo{FingerprintsBaseLabel} = $OptionsInfo{FingerprintsLabel};
  }

  $OptionsInfo{FingerprintsLabel} = $OptionsInfo{FingerprintsBaseLabel} . "; Value IDs: " . $TopologicalPharmacophoreAtomPairsFingerprints->GetFingerprintsVector->GetValueIDsString();
}

# Open output files...
#
# Creates and opens the SD, FP and text fingerprints writers selected by the
# output options for the input file at $FileIndex.
#
# Returns the list ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO); entries
# for output types which are not requested are undef.
#
sub SetupAndOpenOutputFiles {
  my($FileIndex) = @_;
  my($NewFPSDFile, $NewFPFile, $NewFPTextFile, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO, %FingerprintsFileIOParams);

  ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = (undef) x 3;

  # Setup common parameters for fingerprints file IO objects...
  #
  %FingerprintsFileIOParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsVectorString', 'VectorStringFormat' => $OptionsInfo{VectorStringFormat});

  if ($OptionsInfo{SDOutput}) {
    $NewFPSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
    print "Generating SD file $NewFPSDFile...\n";
    $NewFPSDFileIO = FileIO::FingerprintsSDFileIO->new('Name' => $NewFPSDFile, %FingerprintsFileIOParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
    $NewFPSDFileIO->Open();
  }

  if ($OptionsInfo{FPOutput}) {
    $NewFPFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
    print "Generating FP file $NewFPFile...\n";
    $NewFPFileIO = FileIO::FingerprintsFPFileIO->new('Name' => $NewFPFile, %FingerprintsFileIOParams);
    $NewFPFileIO->Open();
  }

  if ($OptionsInfo{TextOutput}) {
    my($ColLabelsRef);

    $NewFPTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
    $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);

    print "Generating text file $NewFPTextFile...\n";
    $NewFPTextFileIO = FileIO::FingerprintsTextFileIO->new('Name' => $NewFPTextFile, %FingerprintsFileIOParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
    $NewFPTextFileIO->Open();
  }

  return ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
}

# Write fingerprints and other data to appropriate output files...
#
# Writes the fingerprints of one molecule to every writer that is defined.
# The SD writer receives the original molecule string, the text writer the
# data column values and the FP writer the compound ID.
#
sub WriteDataToOutputFiles {
  my($FileIndex, $CmpdCount, $Molecule, $TopologicalPharmacophoreAtomPairsFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;
  my($DataFieldLabelAndValuesRef);

  # Data fields are needed only for text and FP output...
  $DataFieldLabelAndValuesRef = undef;
  if ($NewFPTextFileIO || $NewFPFileIO) {
    $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
  }

  if ($NewFPSDFileIO) {
    my($CmpdString);

    $CmpdString = $Molecule->GetInputMoleculeString();
    $NewFPSDFileIO->WriteFingerprints($TopologicalPharmacophoreAtomPairsFingerprints, $CmpdString);
  }

  if ($NewFPTextFileIO) {
    my($ColValuesRef);

    $ColValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
    $NewFPTextFileIO->WriteFingerprints($TopologicalPharmacophoreAtomPairsFingerprints, $ColValuesRef);
  }

  if ($NewFPFileIO) {
    my($CompoundID);

    $CompoundID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
    $NewFPFileIO->WriteFingerprints($TopologicalPharmacophoreAtomPairsFingerprints, $CompoundID);
  }
}

# Generate appropriate column labels for FPText output file...
#
# Returns a reference to the list of data column labels selected by
# $OptionsInfo{DataFieldsMode}, followed by the fingerprints label.
#
sub SetupFPTextFileCoulmnLabels {
  my($FileIndex) = @_;
  my(@ColLabels);

  @ColLabels = ();
  if ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
    push @ColLabels, @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
  }
  elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
    push @ColLabels, @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
  }
  elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
    push @ColLabels, @{$OptionsInfo{SpecifiedDataFields}};
  }
  elsif ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
    push @ColLabels, $OptionsInfo{CompoundIDLabel};
  }
  # Add fingerprints label...
  push @ColLabels, $OptionsInfo{FingerprintsLabel};

  return \@ColLabels;
}

# Generate column values FPText output file..
#
# Returns a reference to the list of data column values for one molecule,
# matching the data columns selected by $OptionsInfo{DataFieldsMode}. A data
# field missing from the molecule yields an empty string.
#
sub SetupFPTextFileCoulmnValues {
  my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
  my(@ColValues);

  @ColValues = ();
  if ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
    push @ColValues, SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
  }
  elsif ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
    @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
  }
  elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
    @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
  }
  elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
    @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$OptionsInfo{SpecifiedDataFields}};
  }

  return \@ColValues;
}

# Generate compound ID for FP and FPText output files..
#
# Builds the ID according to $OptionsInfo{CompoundIDMode}:
#   MolNameOrLabelPrefix - molecule name, or prefix + record number when empty
#   LabelPrefix          - prefix + record number
#   DataField            - value of the data field named by $OptionsInfo{CompoundID}
#   MolName              - molecule name
# Returns an empty string when the mode is none of the above or the data field
# is missing.
#
sub SetupCmpdIDForOutputFiles {
  my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
  my($CmpdID);

  $CmpdID = '';
  if ($OptionsInfo{CompoundIDMode} =~ /^MolNameOrLabelPrefix$/i) {
    my($MolName);
    $MolName = $Molecule->GetName();
    $CmpdID = $MolName ? $MolName : "$OptionsInfo{CompoundID}${CmpdCount}";
  }
  elsif ($OptionsInfo{CompoundIDMode} =~ /^LabelPrefix$/i) {
    $CmpdID = "$OptionsInfo{CompoundID}${CmpdCount}";
  }
  elsif ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i) {
    my($SpecifiedDataField);
    $SpecifiedDataField = $OptionsInfo{CompoundID};
    $CmpdID = exists $DataFieldLabelAndValuesRef->{$SpecifiedDataField} ? $DataFieldLabelAndValuesRef->{$SpecifiedDataField} : '';
  }
  elsif ($OptionsInfo{CompoundIDMode} =~ /^MolName$/i) {
    $CmpdID = $Molecule->GetName();
  }
  return $CmpdID;
}

# Generate fingerprints for molecule...
#
# Optionally reduces the molecule to its largest component, detects rings and
# aromaticity, then generates topological pharmacophore atom pairs
# fingerprints using the processed options.
#
# Returns the fingerprints object, or undef when ring detection or
# fingerprints generation fails.
#
sub GenerateMoleculeFingerprints {
  my($Molecule) = @_;
  my($TopologicalPharmacophoreAtomPairsFingerprints, @FingerprintsParams);

  if ($OptionsInfo{KeepLargestComponent}) {
    $Molecule->KeepLargestComponent();
  }
  if (!$Molecule->DetectRings()) {
    return;
  }
  $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
  $Molecule->DetectAromaticity();

  # Parameters common to fuzzified and non-fuzzified fingerprints; kept as an
  # ordered list so that names and values are passed in the same order as before...
  @FingerprintsParams = ('Molecule' => $Molecule, 'AtomPairsSetSizeToUse' => $OptionsInfo{AtomPairsSetSizeToUse}, 'MinDistance' => $OptionsInfo{MinDistance}, 'MaxDistance' => $OptionsInfo{MaxDistance}, 'AtomTypesToUse' => \@{$OptionsInfo{AtomTypesToUse}}, 'NormalizationMethodology' => $OptionsInfo{NormalizationMethodology}, 'ValuesPrecision' => $OptionsInfo{ValuesPrecision});

  # Fuzzification parameters are passed only when fuzzification is requested...
  if ($OptionsInfo{FuzzifyAtomPairsCount}) {
    push @FingerprintsParams, ('FuzzifyAtomPairsCount' => $OptionsInfo{FuzzifyAtomPairsCount}, 'FuzzificationMode' => $OptionsInfo{FuzzificationMode}, 'FuzzificationMethodology' => $OptionsInfo{FuzzificationMethodology}, 'FuzzFactor' => $OptionsInfo{FuzzFactor});
  }

  $TopologicalPharmacophoreAtomPairsFingerprints = Fingerprints::TopologicalPharmacophoreAtomPairsFingerprints->new(@FingerprintsParams);

  # Set atom types weights...
  if ($OptionsInfo{UseAtomTypesWeight}) {
    $TopologicalPharmacophoreAtomPairsFingerprints->SetAtomTypesWeight(%{$OptionsInfo{AtomTypesWeight}});
  }

  # Generate fingerprints...
  $TopologicalPharmacophoreAtomPairsFingerprints->GenerateFingerprints();

  # Make sure fingerprints generation is successful...
  if (!$TopologicalPharmacophoreAtomPairsFingerprints->IsFingerprintsGenerationSuccessful()) {
    return;
  }

  return $TopologicalPharmacophoreAtomPairsFingerprints;
}

# Retrieve information about SD files...
#
# Fills %SDFilesInfo with, for each entry in @SDFilesList: a FileOkay flag,
# the output file root, the SD/FP/text output file names and, for All or
# Common data fields modes with text output, references to the data field
# labels. A file is marked not okay (with a warning) when it doesn't exist,
# is not a SD file, lacks the data field required for compound IDs, would be
# overwritten by its own SD output or has an existing output file while
# overwriting is off.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFileRoot}} = ();
  @{$SDFilesInfo{SDOutFileNames}} = ();
  @{$SDFilesInfo{FPOutFileNames}} = ();
  @{$SDFilesInfo{TextOutFileNames}} = ();
  @{$SDFilesInfo{AllDataFieldsRef}} = ();
  @{$SDFilesInfo{CommonDataFieldsRef}} = ();

  $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
  $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFileRoot}[$Index] = '';
    $SDFilesInfo{SDOutFileNames}[$Index] = '';
    $SDFilesInfo{FPOutFileNames}[$Index] = '';
    $SDFilesInfo{TextOutFileNames}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    if ($CheckDataField) {
      # Make sure data field exists in SD file; only the first compound is examined...
      my($SDFileHandle, $CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);

      @CmpdLines = ();
      # Three-argument open: the file name is never interpreted as a mode...
      open $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      $CmpdString = ReadCmpdString($SDFileHandle);
      close $SDFileHandle;
      @CmpdLines = split "\n", $CmpdString;
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      $SpecifiedDataField = $OptionsInfo{CompoundID};
      if (!exists $DataFieldValues{$SpecifiedDataField}) {
        warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using  \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
        next FILELIST;
      }
    }

    $AllDataFieldsRef = '';
    $CommonDataFieldsRef = '';
    if ($CollectDataFields) {
      my($SDFileHandle, $CmpdCount);
      open $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileHandle);
      close $SDFileHandle;
    }

    # Setup output file names...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    $TextOutFileExt = "csv";
    if ($Options{outdelim} =~ /^tab$/i) {
      $TextOutFileExt = "tsv";
    }
    $SDOutFileExt = $FileExt;
    $FPOutFileExt = "fpf";

    # A user supplied root is honored only for a single input file...
    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = "${FileName}TopologicalPharmacophoreAtomPairsFP";
    }

    $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
    $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
    $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

    if ($OptionsInfo{SDOutput}) {
      # \Q...\E: compare the output name literally instead of as a regex, so
      # that characters such as ".", "+", "(" or "[" in file names are safe...
      if ($SDFile =~ /\Q$NewSDFileName\E/i) {
        warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
        print "Specify a different name using \"-r --root\" option or use default name.\n";
        next FILELIST;
      }
    }

    if (!$OptionsInfo{OverwriteFiles}) {
      # Check SD, FP and text output files...
      if ($OptionsInfo{SDOutput}) {
        if (-e $NewSDFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{FPOutput}) {
        if (-e $NewFPFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{TextOutput}) {
        if (-e $NewTextFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
          next FILELIST;
        }
      }
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;

    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
    $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
    $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

    $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
    $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
  }
}

# Process option values...
#
# Translates the raw command line values in %Options into %OptionsInfo:
# Yes/No strings become 1/0 flags, the output option becomes SDOutput,
# FPOutput and TextOutput flags, and defaults are filled in for the compound
# ID prefix, fingerprints label and vector string format. Dies when a
# required companion option is missing or the vector string format is not
# allowed for the atom pairs set size.
#
sub ProcessOptions {
  %OptionsInfo = ();

  ProcessAtomTypesToUseOption();
  ProcessAtomTypesWeightOption();

  $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel};

  $OptionsInfo{AtomPairsSetSizeToUse} = $Options{atompairssetsizetouse};

  $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
  $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
  $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};

  my(@SpecifiedDataFields);
  @SpecifiedDataFields = ();

  @{$OptionsInfo{SpecifiedDataFields}} = ();
  $OptionsInfo{CompoundID} = '';

  if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
    if ($Options{compoundidmode} =~ /^DataField$/i) {
      if (!$Options{compoundid}) {
        die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
      }
      $OptionsInfo{CompoundID} = $Options{compoundid};
    }
    elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
      $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
    }
  }
  elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
    if (!$Options{datafields}) {
      die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
    }
    @SpecifiedDataFields = split /\,/, $Options{datafields};
    push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields;
  }

  $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{FingerprintsLabelMode} = $Options{fingerprintslabelmode};
  $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'TopologicalPharmacophoreAtomPairsFingerprints';

  $OptionsInfo{FuzzifyAtomPairsCount} = ($Options{fuzzifyatompairscount} =~ /^Yes$/i) ? 1 : 0;
  $OptionsInfo{FuzzificationMode} = $Options{fuzzificationmode};
  $OptionsInfo{FuzzificationMethodology} = $Options{fuzzificationmethodology};
  $OptionsInfo{FuzzFactor} = $Options{fuzzfactor};

  $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{MinDistance} = $Options{mindistance};
  $OptionsInfo{MaxDistance} = $Options{maxdistance};

  $OptionsInfo{NormalizationMethodology} = $Options{normalizationmethodology};

  $OptionsInfo{Output} = $Options{output};
  $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
  $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
  $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;

  $OptionsInfo{OutDelim} = $Options{outdelim};
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;

  $OptionsInfo{ValuesPrecision} = $Options{valuesprecision};

  # Setup default vector string format...
  my($VectorStringFormat);
  $VectorStringFormat = '';

  if ($Options{vectorstringformat}) {
    $VectorStringFormat = $Options{vectorstringformat};

    if ($Options{atompairssetsizetouse} =~ /^ArbitrarySize$/i && $VectorStringFormat =~ /^ValuesString$/i) {
      die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid for $Options{atompairssetsizetouse} value of \"--AtomPairsSetSizeToUse\" option. Allowed values: IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
    }
  }
  else {
    # Case-insensitive, like every other check of this option value...
    $VectorStringFormat = ($Options{atompairssetsizetouse} =~ /^FixedSize$/i) ? "ValuesString" : "IDsAndValuesString";
  }
  $OptionsInfo{VectorStringFormat} = $VectorStringFormat;
}

# Process atom type to use option...
#
# Splits the comma delimited "-a, --AtomTypesToUse" value, after removing
# spaces, into @{$OptionsInfo{AtomTypesToUse}}. Dies when the value is empty
# or contains an atom type which is not an available functional class.
#
sub ProcessAtomTypesToUseOption {
  my($AtomType, $SpecifiedAtomTypesToUse, @AtomTypesWords);

  @{$OptionsInfo{AtomTypesToUse}} = ();
  if (IsEmpty($Options{atomtypestouse})) {
    die "Error: Atom types value specified using \"-a, --AtomTypesToUse\" option is empty\n";
  }

  $SpecifiedAtomTypesToUse = $Options{atomtypestouse};
  $SpecifiedAtomTypesToUse =~ s/ //g;
  @AtomTypesWords = split /\,/, $SpecifiedAtomTypesToUse;

  for $AtomType (@AtomTypesWords) {
    if (!AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($AtomType)) {
      die "Error: Atomic type specified, $AtomType, using \"-a, --AtomTypesToUse\" option is not valid...\n ";
    }
    push @{$OptionsInfo{AtomTypesToUse}}, $AtomType;
  }
}

# Process atom types weight option...
#
# Sets $OptionsInfo{UseAtomTypesWeight} and, unless the "--AtomTypesWeight"
# value is None, parses its comma delimited "AtomType,Weight,..." pairs into
# %{$OptionsInfo{AtomTypesWeight}}. Dies when the value is empty, contains an
# odd number of entries, names an unavailable functional class or specifies a
# weight which is not a real number >= 0.
#
sub ProcessAtomTypesWeightOption {
  my($Index, $AtomType, $AtomTypeWeight, $SpecifiedAtomTypesWeight, @AtomTypesWeightsPairs);

  %{$OptionsInfo{AtomTypesWeight}} = ();

  if (IsEmpty($Options{atomtypesweight})) {
    die "Error: Atom types weight value specified using \"--AtomTypesWeight\" option is empty\n";
  }
  $OptionsInfo{UseAtomTypesWeight} = ($Options{atomtypesweight} =~ /^None$/i) ? 0 : 1;
  if (!$OptionsInfo{UseAtomTypesWeight}) {
    return;
  }

  # Process specified atom type/weight pairs...
  $SpecifiedAtomTypesWeight = $Options{atomtypesweight};
  $SpecifiedAtomTypesWeight =~ s/ //g;
  @AtomTypesWeightsPairs = split /\,/, $SpecifiedAtomTypesWeight;

  if (@AtomTypesWeightsPairs % 2) {
    die "Error: Invalid number of values specified using \"--AtomTypesWeight\" option: It must contain even number of values.\n";
  }

  for ($Index = 0; $Index < @AtomTypesWeightsPairs; $Index += 2) {
    $AtomType = $AtomTypesWeightsPairs[$Index]; $AtomTypeWeight = $AtomTypesWeightsPairs[$Index + 1];
    if (!AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($AtomType)) {
      die "Error: Atom type specified, $AtomType, using \"--AtomTypesWeight\" option is not valid\n ";
    }
    if (!(IsFloat($AtomTypeWeight) && $AtomTypeWeight >= 0)) {
      die "Error: Atom type weight specified, $AtomTypeWeight, using option \"--AtomTypesWeight\" is not valid. Allowed values: real numbers >= 0 \n";
    }
    $OptionsInfo{AtomTypesWeight}{$AtomType} = $AtomTypeWeight;
  }
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Initializes %Options with default values, overlays the values parsed from
# the command line by GetOptions, changes to the working directory when one
# is specified and validates each option value. Dies with an explanatory
# message on the first invalid value.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  $Options{aromaticitymodel} = 'MayaChemToolsAromaticityModel';

  $Options{atompairssetsizetouse} = 'ArbitrarySize';

  $Options{atomtypestouse} = 'HBD,HBA,PI,NI,H';
  $Options{atomtypesweight} = 'None';

  $Options{compoundidmode} = 'LabelPrefix';
  $Options{compoundidlabel} = 'CompoundID';
  $Options{datafieldsmode} = 'CompoundID';

  $Options{filter} = 'Yes';

  $Options{fingerprintslabelmode} = 'FingerprintsLabelOnly';

  $Options{fuzzifyatompairscount} = 'No';
  $Options{fuzzificationmode} = 'AfterNormalization';
  $Options{fuzzificationmethodology} = 'FuzzyBinning';
  $Options{fuzzfactor} = 0.15;

  $Options{keeplargestcomponent} = 'Yes';

  $Options{mindistance} = 1;
  $Options{maxdistance} = 10;

  $Options{normalizationmethodology} = 'None';

  $Options{output} = 'text';
  $Options{outdelim} = 'comma';
  $Options{quote} = 'yes';

  $Options{valuesprecision} = 2;

  $Options{vectorstringformat} = '';

  if (!GetOptions(\%Options, "aromaticitymodel=s", "atompairssetsizetouse=s", "atomtypestouse|a=s", "atomtypesweight=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "filter|f=s", "fingerprintslabelmode=s", "fingerprintslabel=s", "fuzzifyatompairscount=s", "fuzzificationmode=s", "fuzzificationmethodology=s", "fuzzfactor=s", "help|h", "keeplargestcomponent|k=s", "mindistance=s", "maxdistance=s", "normalizationmethodology|n=s", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s", "valuesprecision=s", "vectorstringformat|v=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if (!Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
    my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
    die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
  }
  if ($Options{atompairssetsizetouse} !~ /^(ArbitrarySize|FixedSize)$/i) {
    die "Error: The value specified, $Options{atompairssetsizetouse}, for option \"--AtomPairsSetSizeToUse\" is not valid. Allowed values: ArbitrarySize or FixedSize\n";
  }
  if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
    die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
  }
  if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
    die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
  }
  if ($Options{filter} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{fingerprintslabelmode} !~ /^(FingerprintsLabelOnly|FingerprintsLabelWithIDs)$/i) {
    die "Error: The value specified, $Options{fingerprintslabelmode}, for option \"--FingerprintsLabelMode\" is not valid. Allowed values: FingerprintsLabelOnly or FingerprintsLabelWithIDs\n";
  }
  if ($Options{fuzzifyatompairscount} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{fuzzifyatompairscount}, for option \"--FuzzifyAtomPairsCount\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{fuzzificationmode} !~ /^(BeforeNormalization|AfterNormalization)$/i) {
    die "Error: The value specified, $Options{fuzzificationmode}, for option \"--FuzzificationMode\" is not valid. Allowed values: BeforeNormalization or AfterNormalization\n";
  }
  if ($Options{fuzzificationmethodology} !~ /^(FuzzyBinning|FuzzyBinSmoothing)$/i) {
    die "Error: The value specified, $Options{fuzzificationmethodology}, for option \"--FuzzificationMethodology\" is not valid. Allowed values: FuzzyBinning or FuzzyBinSmoothing\n";
  }
  if (!IsFloat($Options{fuzzfactor})) {
    die "Error: The value specified, $Options{fuzzfactor}, for option \"--FuzzFactor\" is not valid. Allowed values: real numbers >= 0 \n";
  }
  # The allowed fuzz factor range depends on the selected methodology: each
  # range check must run for the methodology it names...
  if ($Options{fuzzificationmethodology} =~ /^FuzzyBinning$/i) {
    if (!($Options{fuzzfactor} >=0 && $Options{fuzzfactor} <= 1.0)) {
      die "Error: The value specified, $Options{fuzzfactor}, for option \"--FuzzFactor\" during FuzzyBinning \"--FuzzificationMethodology\" is not valid. Allowed values: >= 0 and <= 1 \n";
    }
  }
  elsif ($Options{fuzzificationmethodology} =~ /^FuzzyBinSmoothing$/i) {
    if (!($Options{fuzzfactor} >=0 && $Options{fuzzfactor} <= 0.5)) {
      die "Error: The value specified, $Options{fuzzfactor}, for option \"--FuzzFactor\" during FuzzyBinSmoothing \"--FuzzificationMethodology\" is not valid. Allowed values: >= 0 and <= 0.5 \n";
    }
  }
  if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
  }
  # Negative integers are rejected as well, as stated by the allowed values...
  if (!IsInteger($Options{mindistance}) || $Options{mindistance} < 0) {
    die "Error: The value specified, $Options{mindistance}, for option \"--MinDistance\" is not valid. Allowed values: >= 0 \n";
  }
  if (!IsPositiveInteger($Options{maxdistance})) {
    die "Error: The value specified, $Options{maxdistance}, for option \"--MaxDistance\" is not valid. Allowed values: > 0 \n";
  }
  if ($Options{mindistance} > $Options{maxdistance}) {
    die "Error: The value specified, specified, $Options{mindistance}, for option \"--MinDistance\" must be less than the value specified, $Options{maxdistance}, for option \"--MaxDistance\" \n";
  }
  if ($Options{normalizationmethodology} !~ /^(None|ByHeavyAtomsCount|ByAtomTypesCount)$/i) {
    die "Error: The value specified, $Options{normalizationmethodology}, for option \"--NormalizationMethodology\" is not valid. Allowed values: None, ByHeavyAtomsCount, or ByAtomTypesCount\n";
  }
  if ($Options{output} !~ /^(SD|FP|text|all)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
  }
  if (!IsPositiveInteger($Options{valuesprecision})) {
    die "Error: The value specified, $Options{valuesprecision}, for option \"--ValuesPrecision\" is not valid. Allowed values: > 0 \n";
  }
  if ($Options{vectorstringformat} && $Options{vectorstringformat} !~ /^(ValuesString|IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
    die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: ValuesString, IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
  }
}