#!/usr/bin/perl -w
#
# File: CalculatePhysicochemicalProperties.pl
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2024 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;
use SDFileUtil;
use MoleculeFileIO;
use Molecule;
use AtomTypes::AtomicInvariantsAtomTypes;
use AtomTypes::FunctionalClassAtomTypes;
use MolecularDescriptors::MolecularDescriptorsGenerator;

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV < 1);

my @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my %OptionsInfo;
ProcessOptions();

# Setup information about input files...
print "Checking input SD file(s)...\n";
my %SDFilesInfo;
RetrieveSDFilesInfo();

# Process input files, skipping any file rejected during the checks above...
my($FileIndex);
print "\nProcessing SD files...\n" if (@SDFilesList > 1);
for $FileIndex (0 .. $#SDFilesList) {
  next unless $SDFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  CalculatePhysicochemicalProperties($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Calculate physicochemical properties for one SD file...
#
# Takes the index of the file in @SDFilesList. Opens the output file(s), reads
# the input molecule by molecule, optionally filters each compound, calculates
# the requested properties, writes them out and finally prints the summary
# counts for the file.
#
sub CalculatePhysicochemicalProperties {
  my($FileIndex) = @_;

  my $SDFile = $SDFilesList[$FileIndex];

  # Setup output files...
  my($NewSDFileRef, $NewTextFileRef) = SetupAndOpenOutputFiles($FileIndex);

  # Setup molecular descriptor generator to calculate property values for
  # specified property names...
  my $MolecularDescriptorsGenerator = SetupMolecularDescriptorsGenerator();

  my($CmpdCount, $IgnoredCmpdCount, $RuleOf5ViolationsCount, $RuleOf3ViolationsCount) = ('0') x 4;

  my $MoleculeFileIO = MoleculeFileIO->new('Name' => $SDFile);
  $MoleculeFileIO->Open();

  COMPOUND: while (my $Molecule = $MoleculeFileIO->ReadMolecule()) {
    $CmpdCount++;

    # Filter compound data before calculating physicochemical properties...
    if ($OptionsInfo{Filter} && CheckAndFilterCompound($CmpdCount, $Molecule)) {
      $IgnoredCmpdCount++;
      next COMPOUND;
    }

    # Calculate properties...
    my $PhysicochemicalPropertiesDataRef = CalculateMoleculeProperties($MolecularDescriptorsGenerator, $Molecule);

    unless (defined $PhysicochemicalPropertiesDataRef) {
      $IgnoredCmpdCount++;
      ProcessIgnoredCompound('PropertiesCalculationFailed', $CmpdCount, $Molecule);
      next COMPOUND;
    }

    # Track compounds having one or more rule violations...
    $RuleOf5ViolationsCount++ if ($OptionsInfo{RuleOf5Violations} && $PhysicochemicalPropertiesDataRef->{RuleOf5Violations});
    $RuleOf3ViolationsCount++ if ($OptionsInfo{RuleOf3Violations} && $PhysicochemicalPropertiesDataRef->{RuleOf3Violations});

    # Write out calculated properties...
    WriteDataToOutputFiles($FileIndex, $CmpdCount, $Molecule, $PhysicochemicalPropertiesDataRef, $NewSDFileRef, $NewTextFileRef);
  }
  $MoleculeFileIO->Close();

  # The references are empty strings for output types which were not requested...
  close $NewSDFileRef if ($OptionsInfo{SDOutput} && $NewSDFileRef);
  close $NewTextFileRef if ($OptionsInfo{TextOutput} && $NewTextFileRef);

  WriteCalculationSummaryStatistics($CmpdCount, $IgnoredCmpdCount, $RuleOf5ViolationsCount, $RuleOf3ViolationsCount);
}

# Process compound being ignored due to problems in physicochemical properties calculation...
#
# Emits a warning naming the record number and compound ID of an ignored
# compound. $Mode selects the reason text; any mode other than the two
# "Contains..." modes is reported as a failed properties calculation.
#
sub ProcessIgnoredCompound {
  my($Mode, $CmpdCount, $Molecule) = @_;

  my $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
  my $CmpdID = SetupCmpdIDForTextFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);

  if ($Mode =~ /^ContainsNonElementalData$/i) {
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
  }
  elsif ($Mode =~ /^ContainsNoElementalData$/i) {
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n";
  }
  else {
    # Covers PropertiesCalculationFailed as well as any unrecognized mode...
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Physicochemical properties calculation didn't succeed...\n\n";
  }
}

# Check and filter compounds....
#
# Returns 1, after reporting the compound as ignored, when the molecule has any
# non-element atoms or no element atoms at all; returns 0 otherwise.
#
sub CheckAndFilterCompound {
  my($CmpdCount, $Molecule) = @_;

  my($ElementCount, $NonElementCount) = $Molecule->GetNumOfElementsAndNonElements();

  my $IgnoreMode = $NonElementCount ? 'ContainsNonElementalData'
    : (!$ElementCount ? 'ContainsNoElementalData' : '');

  return 0 unless $IgnoreMode;

  ProcessIgnoredCompound($IgnoreMode, $CmpdCount, $Molecule);
  return 1;
}

# Write out compounds physicochemical properties calculation summary statistics...
#
# Prints the total, processed (total minus ignored) and ignored compound counts
# and, for each rule violations option which is turned on, the number of
# compounds with one or more violations.
#
sub WriteCalculationSummaryStatistics {
  my($CmpdCount, $IgnoredCmpdCount, $RuleOf5ViolationsCount, $RuleOf3ViolationsCount) = @_;

  my $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;

  print "\nNumber of compounds: $CmpdCount\n";
  print "Number of compounds processed successfully during physicochemical properties calculation: $ProcessedCmpdCount\n";
  print "Number of compounds ignored during physicochemical properties calculation: $IgnoredCmpdCount\n";

  print "Number of compounds with one or more RuleOf5 violations: $RuleOf5ViolationsCount\n" if $OptionsInfo{RuleOf5Violations};
  print "Number of compounds with one or more RuleOf3 violations: $RuleOf3ViolationsCount\n" if $OptionsInfo{RuleOf3Violations};
}

# Open output files...
#
# Opens the SD and/or text output file recorded in %SDFilesInfo for the input
# file at $FileIndex, depending on the SDOutput and TextOutput options, and
# writes the column labels line to a newly opened text file.
#
# Returns ($NewSDFileRef, $NewTextFileRef). Each one is an open file handle
# reference, or an empty string when that output type is not requested. Dies
# when a file can't be opened.
#
# Lexical handles and three-argument open are used so that the output file
# name is always taken literally and no package-global handle is shared
# between calls.
#
sub SetupAndOpenOutputFiles {
  my($FileIndex) = @_;
  my($NewSDFileRef, $NewTextFileRef) = ('', '');

  if ($OptionsInfo{SDOutput}) {
    my $NewSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
    print "Generating SD file $NewSDFile...\n";
    open my $NewSDFileHandle, '>', $NewSDFile or die "Error: Couldn't open $NewSDFile: $! \n";
    $NewSDFileRef = $NewSDFileHandle;
  }

  if ($OptionsInfo{TextOutput}) {
    my $NewTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
    print "Generating text file $NewTextFile...\n";
    open my $NewTextFileHandle, '>', $NewTextFile or die "Error: Couldn't open $NewTextFile: $! \n";
    WriteTextFileCoulmnLabels($FileIndex, $NewTextFileHandle);
    $NewTextFileRef = $NewTextFileHandle;
  }

  return ($NewSDFileRef, $NewTextFileRef);
}

# Write calculated physicochemical properties and other data to appropriate output files...
#
# For SD output, the input compound string is written without its trailing
# delimiter, followed by one data field per specified property, the optional
# rule violation counts and a fresh delimiter. For text output, one line is
# written holding the data field values selected by DataFieldsMode followed by
# the same property values and violation counts.
#
sub WriteDataToOutputFiles {
  my($FileIndex, $CmpdCount, $Molecule, $PhysicochemicalPropertiesDataRef, $NewSDFileRef, $NewTextFileRef) = @_;
  my($PropertyName, $PropertyValue);

  if ($OptionsInfo{SDOutput}) {
    # Strip the record delimiter so that the new data fields can be appended...
    my $CmpdString = $Molecule->GetInputMoleculeString();
    $CmpdString =~ s/\$\$\$\$$//;
    print {$NewSDFileRef} "$CmpdString";

    for $PropertyName (@{$OptionsInfo{SpecifiedPropertyNames}}) {
      $PropertyValue = $PhysicochemicalPropertiesDataRef->{$PropertyName};
      print {$NewSDFileRef} "> <$PropertyName>\n$PropertyValue\n\n";
    }

    if ($OptionsInfo{RuleOf5Violations}) {
      $PropertyValue = $PhysicochemicalPropertiesDataRef->{RuleOf5Violations};
      print {$NewSDFileRef} "> <RuleOf5Violations>\n$PropertyValue\n\n";
    }

    if ($OptionsInfo{RuleOf3Violations}) {
      $PropertyValue = $PhysicochemicalPropertiesDataRef->{RuleOf3Violations};
      print {$NewSDFileRef} "> <RuleOf3Violations>\n$PropertyValue\n\n";
    }

    # Terminate the record...
    print {$NewSDFileRef} "\$\$\$\$\n";
  }

  return unless $OptionsInfo{TextOutput};

  my $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();

  # Values for a list of data field labels; a label missing from this compound
  # yields an empty string so that the columns stay aligned...
  my $DataFieldValuesFor = sub {
    my($LabelsRef) = @_;
    return map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : '' } @{$LabelsRef};
  };

  my $DataFieldsMode = $OptionsInfo{DataFieldsMode};
  my @LineWords = ();
  if ($DataFieldsMode =~ /^CompoundID$/i) {
    push @LineWords, SetupCmpdIDForTextFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
  }
  elsif ($DataFieldsMode =~ /^All$/i) {
    @LineWords = $DataFieldValuesFor->($SDFilesInfo{AllDataFieldsRef}[$FileIndex]);
  }
  elsif ($DataFieldsMode =~ /^Common$/i) {
    @LineWords = $DataFieldValuesFor->($SDFilesInfo{CommonDataFieldsRef}[$FileIndex]);
  }
  elsif ($DataFieldsMode =~ /^Specify$/i) {
    @LineWords = $DataFieldValuesFor->($OptionsInfo{SpecifiedDataFields});
  }

  # Append calculated property values followed by any rule violation counts...
  for $PropertyName (@{$OptionsInfo{SpecifiedPropertyNames}}) {
    push @LineWords, $PhysicochemicalPropertiesDataRef->{$PropertyName};
  }
  push @LineWords, $PhysicochemicalPropertiesDataRef->{RuleOf5Violations} if $OptionsInfo{RuleOf5Violations};
  push @LineWords, $PhysicochemicalPropertiesDataRef->{RuleOf3Violations} if $OptionsInfo{RuleOf3Violations};

  my $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print {$NewTextFileRef} "$Line\n";
}

# Write out appropriate column labels to text file...
#
# The labels line holds the data field labels selected by DataFieldsMode,
# followed by the specified property names and the rule violation labels for
# the options which are turned on.
#
sub WriteTextFileCoulmnLabels {
  my($FileIndex, $NewTextFileRef) = @_;

  my $DataFieldsMode = $OptionsInfo{DataFieldsMode};
  my @LineWords = ();

  if ($DataFieldsMode =~ /^All$/i) {
    push @LineWords, @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
  }
  elsif ($DataFieldsMode =~ /^Common$/i) {
    push @LineWords, @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
  }
  elsif ($DataFieldsMode =~ /^Specify$/i) {
    push @LineWords, @{$OptionsInfo{SpecifiedDataFields}};
  }
  elsif ($DataFieldsMode =~ /^CompoundID$/i) {
    push @LineWords, $OptionsInfo{CompoundIDLabel};
  }

  # Physicochemical properties column labels and rule violation labels...
  push @LineWords, @{$OptionsInfo{SpecifiedPropertyNames}};
  push @LineWords, 'RuleOf5Violations' if $OptionsInfo{RuleOf5Violations};
  push @LineWords, 'RuleOf3Violations' if $OptionsInfo{RuleOf3Violations};

  my $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print {$NewTextFileRef} "$Line\n";
}

# Generate compound ID for text files..
#
# The ID is derived according to CompoundIDMode: molecule name with a fallback
# to label prefix plus record number, label prefix plus record number, value of
# the data field named by CompoundID, or molecule name. An empty string is
# returned for a missing data field or an unrecognized mode.
#
sub SetupCmpdIDForTextFiles {
  my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;

  my $CompoundIDMode = $OptionsInfo{CompoundIDMode};

  if ($CompoundIDMode =~ /^MolNameOrLabelPrefix$/i) {
    my $MolName = $Molecule->GetName();
    return $MolName if $MolName;
    return "$OptionsInfo{CompoundID}${CmpdCount}";
  }

  if ($CompoundIDMode =~ /^LabelPrefix$/i) {
    return "$OptionsInfo{CompoundID}${CmpdCount}";
  }

  if ($CompoundIDMode =~ /^DataField$/i) {
    my $SpecifiedDataField = $OptionsInfo{CompoundID};
    return '' unless exists $DataFieldLabelAndValuesRef->{$SpecifiedDataField};
    return $DataFieldLabelAndValuesRef->{$SpecifiedDataField};
  }

  if ($CompoundIDMode =~ /^MolName$/i) {
    # Assign first so that the name is always retrieved in scalar context...
    my $MolName = $Molecule->GetName();
    return $MolName;
  }

  return '';
}

# Calculate physicochemical properties for molecule...
#
# Prepares the molecule (optional largest component selection, ring and
# aromaticity detection, optional hydrogens), runs the descriptors generator
# and adds the requested rule violation counts. Returns a reference to a hash
# of property names and values, or undef when ring detection or descriptors
# generation doesn't succeed.
#
sub CalculateMoleculeProperties {
  my($MolecularDescriptorsGenerator, $Molecule) = @_;

  $Molecule->KeepLargestComponent() if $OptionsInfo{KeepLargestComponent};

  # Explicit undef is kept as the failure value expected by the caller...
  return undef unless $Molecule->DetectRings();

  $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
  $Molecule->DetectAromaticity();

  $Molecule->AddHydrogens() if $OptionsInfo{AddHydrogens};

  # Calculate physicochemical properties...
  $MolecularDescriptorsGenerator->SetMolecule($Molecule);
  $MolecularDescriptorsGenerator->GenerateDescriptors();

  return undef unless $MolecularDescriptorsGenerator->IsDescriptorsGenerationSuccessful();

  my %CalculatedPhysicochemicalProperties = $MolecularDescriptorsGenerator->GetDescriptorNamesAndValues();

  # Count RuleOf3 violations followed by RuleOf5 violations...
  for my $RuleViolationsType ('RuleOf3Violations', 'RuleOf5Violations') {
    next unless $OptionsInfo{$RuleViolationsType};
    CalculateRuleViolationsCount($RuleViolationsType, \%CalculatedPhysicochemicalProperties);
  }

  return \%CalculatedPhysicochemicalProperties;
}

# Setup molecular descriptor generator to calculate property values for specified
# property names...
#
# Creates a generator in 'Specify' mode for the specified property names and
# passes descriptor class parameters to it only for those descriptor classes
# which have at least one of their properties specified. Returns the generator.
#
sub SetupMolecularDescriptorsGenerator {
  my $MolecularDescriptorsGenerator = MolecularDescriptors::MolecularDescriptorsGenerator->new('Mode' => 'Specify', 'DescriptorNames' => \@{$OptionsInfo{SpecifiedPropertyNames}});

  # One entry per descriptor class, in the order the parameters are set: class
  # name, property names which require it and a callback returning its
  # parameters. The callback defers looking at a parameters map until the
  # class is known to be needed...
  my @DescriptorClassSetups = (
    ['WeightAndMassDescriptors', ['MolecularWeight', 'ExactMass'], sub { return %{$OptionsInfo{PrecisionParametersMap}}; }],
    ['RotatableBondsDescriptors', ['RotatableBonds'], sub { return %{$OptionsInfo{RotatableBondsParametersMap}}; }],
    ['HydrogenBondsDescriptors', ['HydrogenBondDonors', 'HydrogenBondAcceptors'], sub { return ('HydrogenBondsType' => $OptionsInfo{HydrogenBonds}); }],
    ['TPSADescriptors', ['TPSA'], sub { return %{$OptionsInfo{TPSAParametersMap}}; }],
    ['MolecularComplexityDescriptors', ['MolecularComplexity'], sub { return %{$OptionsInfo{MolecularComplexityParametersMap}}; }],
  );

  # Setup molecular descriptor calculation parameters...
  for my $DescriptorClassSetupRef (@DescriptorClassSetups) {
    my($DescriptorClassName, $RequiredByNamesRef, $GetParameters) = @{$DescriptorClassSetupRef};

    my $IsRequired = grep { exists($OptionsInfo{SpecifiedPropertyNamesMap}{lc($_)}) } @{$RequiredByNamesRef};
    next unless $IsRequired;

    $MolecularDescriptorsGenerator->SetDescriptorClassParameters('DescriptorClassName' => $DescriptorClassName, $GetParameters->());
  }

  return $MolecularDescriptorsGenerator;
}

# Calculate RuleOf3 or RuleOf5 violations count...
#
# Counts the properties whose calculated value exceeds the maximum allowed by
# the rule and stores the count in the calculated properties map under
# $RuleViolationsType. Dies for any type other than RuleOf3Violations or
# RuleOf5Violations.
#
sub CalculateRuleViolationsCount {
  my($RuleViolationsType, $CalculatedPropertiesMapRef) = @_;
  my($PropertyName, $PropertyNamesRef, $MaxPropertyValuesMapRef, $RuleViolationsCount);

  # Pick the property names and thresholds for the rule; both rules are then
  # counted by the same loop...
  if ($RuleViolationsType =~ /^RuleOf3Violations$/i) {
    $PropertyNamesRef = $OptionsInfo{RuleOf3PropertyNames};
    $MaxPropertyValuesMapRef = $OptionsInfo{RuleOf3MaxPropertyValuesMap};
  }
  elsif ($RuleViolationsType =~ /^RuleOf5Violations$/i) {
    $PropertyNamesRef = $OptionsInfo{RuleOf5PropertyNames};
    $MaxPropertyValuesMapRef = $OptionsInfo{RuleOf5MaxPropertyValuesMap};
  }
  else {
    # This is fatal, so it is reported as an error and not as a warning...
    die "Error: Unknown rule violation type: $RuleViolationsType...";
  }

  $RuleViolationsCount = 0;
  for $PropertyName (@{$PropertyNamesRef}) {
    if ($CalculatedPropertiesMapRef->{$PropertyName} > $MaxPropertyValuesMapRef->{$PropertyName}) {
      $RuleViolationsCount++;
    }
  }

  # Set rule violation count...
  $CalculatedPropertiesMapRef->{$RuleViolationsType} = $RuleViolationsCount;
}

# Retrieve information about SD files...
#
# Fills %SDFilesInfo with one entry per file in @SDFilesList: whether the file
# is okay to process, the output file root, the SD and text output file names
# and, when text output uses All or Common data fields mode, references to the
# collected data field labels.
#
# A file is ignored, with a warning, when it doesn't exist, is not a SD file,
# lacks the data field needed for compound IDs, would be overwritten by its
# own SD output, or has an existing output file while overwriting is off.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $NewSDFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFileRoot}} = ();
  @{$SDFilesInfo{SDOutFileNames}} = ();
  @{$SDFilesInfo{TextOutFileNames}} = ();
  @{$SDFilesInfo{AllDataFieldsRef}} = ();
  @{$SDFilesInfo{CommonDataFieldsRef}} = ();

  $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
  $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    # Assume the worst until all the checks have passed...
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFileRoot}[$Index] = '';
    $SDFilesInfo{SDOutFileNames}[$Index] = '';
    $SDFilesInfo{TextOutFileNames}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    if ($CheckDataField) {
      # Make sure data field exists in the first compound of SD file..
      my($SDFileHandle, $CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);

      open $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      $CmpdString = ReadCmpdString($SDFileHandle);
      close $SDFileHandle;

      # Guard against a file from which no compound string could be read...
      @CmpdLines = defined($CmpdString) ? split("\n", $CmpdString) : ();
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      $SpecifiedDataField = $OptionsInfo{CompoundID};
      if (!exists $DataFieldValues{$SpecifiedDataField}) {
        warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
        next FILELIST;
      }
    }

    $AllDataFieldsRef = '';
    $CommonDataFieldsRef = '';
    if ($CollectDataFields) {
      my($SDFileHandle, $CmpdCount);
      open $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileHandle);
      close $SDFileHandle;
    }

    # Setup output file names...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    $TextOutFileExt = "csv";
    if ($Options{outdelim} =~ /^tab$/i) {
      $TextOutFileExt = "tsv";
    }
    $SDOutFileExt = $FileExt;

    # A specified root is used only for a single input file...
    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = "${FileName}PhysicochemicalProperties";
    }

    $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
    $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

    if ($OptionsInfo{SDOutput}) {
      # The output name is quoted so that it is matched literally: dots and
      # other characters in a file name must not act as regex metacharacters...
      if ($SDFile =~ /\Q$NewSDFileName\E/i) {
        warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
        print "Specify a different name using \"-r --root\" option or use default name.\n";
        next FILELIST;
      }
    }

    if (!$OptionsInfo{OverwriteFiles}) {
      # Check SD and text output files...
      if ($OptionsInfo{SDOutput}) {
        if (-e $NewSDFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{TextOutput}) {
        if (-e $NewTextFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
          next FILELIST;
        }
      }
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;

    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
    $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

    $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
    $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
  }
}

# Process option values...
625 sub ProcessOptions { 626 %OptionsInfo = (); 627 628 $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel}; 629 630 # Process property name related options... 631 ProcessPropertyNamesOption(); 632 633 # Setup RuleOf3 and RuleOf5 violation calculations... 634 $OptionsInfo{RuleOf3Violations} = ($Options{ruleof3violations} =~ /^Yes$/i) ? 1 : 0; 635 $OptionsInfo{RuleOf5Violations} = ($Options{ruleof5violations} =~ /^Yes$/i) ? 1 : 0; 636 637 $OptionsInfo{CompoundIDMode} = $Options{compoundidmode}; 638 $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel}; 639 $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode}; 640 641 my(@SpecifiedDataFields); 642 @SpecifiedDataFields = (); 643 644 @{$OptionsInfo{SpecifiedDataFields}} = (); 645 $OptionsInfo{CompoundID} = ''; 646 647 if ($Options{datafieldsmode} =~ /^CompoundID$/i) { 648 if ($Options{compoundidmode} =~ /^DataField$/i) { 649 if (!$Options{compoundid}) { 650 die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n"; 651 } 652 $OptionsInfo{CompoundID} = $Options{compoundid}; 653 } 654 elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) { 655 $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd'; 656 } 657 } 658 elsif ($Options{datafieldsmode} =~ /^Specify$/i) { 659 if (!$Options{datafields}) { 660 die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n"; 661 } 662 @SpecifiedDataFields = split /\,/, $Options{datafields}; 663 push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields; 664 } 665 666 # Types of hydrogen bonds... 667 $OptionsInfo{HydrogenBonds} = $Options{hydrogenbonds}; 668 669 # Process precision value parameters... 670 ProcessPrecisionOption(); 671 672 # Process rotatable bonds parameters... 673 ProcessRotatableBondsOption(); 674 675 # Process TPSA parameters... 
676 ProcessTPSAOption(); 677 678 # Process molecular complexity parameters... 679 ProcessMolecularComplexityOption(); 680 681 $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0; 682 683 $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0; 684 685 $OptionsInfo{Output} = $Options{output}; 686 $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|Both)$/i) ? 1 : 0; 687 $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|Both)$/i) ? 1 : 0; 688 689 $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,"); 690 $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0; 691 692 $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0; 693 $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0; 694 } 695 696 # Process property name related options... 697 # 698 sub ProcessPropertyNamesOption { 699 700 # Setup supported physicochemical properties... 701 my($SupportedProperty); 702 703 @{$OptionsInfo{SupportedPropertyNames}} = (); 704 %{$OptionsInfo{SupportedPropertyNamesMap}} = (); 705 706 @{$OptionsInfo{RuleOf5PropertyNames}} = (); 707 %{$OptionsInfo{RuleOf5MaxPropertyValuesMap}} = (); 708 709 @{$OptionsInfo{RuleOf3PropertyNames}} = (); 710 %{$OptionsInfo{RuleOf3MaxPropertyValuesMap}} = (); 711 712 @{$OptionsInfo{DefaultPropertyNames}} = (); 713 714 @{$OptionsInfo{SupportedPropertyNames}} = qw(MolecularWeight ExactMass HeavyAtoms Rings AromaticRings MolecularVolume RotatableBonds HydrogenBondDonors HydrogenBondAcceptors SLogP SMR TPSA Fsp3Carbons Sp3Carbons MolecularComplexity); 715 716 @{$OptionsInfo{RuleOf5PropertyNames}} = qw(MolecularWeight HydrogenBondDonors HydrogenBondAcceptors SLogP); 717 %{$OptionsInfo{RuleOf5MaxPropertyValuesMap}} = ('MolecularWeight' => 500, 'HydrogenBondDonors' => 5, 'HydrogenBondAcceptors' => 10, 'SLogP' => 5); 718 719 @{$OptionsInfo{RuleOf3PropertyNames}} = qw(MolecularWeight RotatableBonds HydrogenBondDonors 
HydrogenBondAcceptors SLogP TPSA); 720 %{$OptionsInfo{RuleOf3MaxPropertyValuesMap}} = ('MolecularWeight' => 300, 'RotatableBonds' => 3, 'HydrogenBondDonors' => 3, 'HydrogenBondAcceptors' => 3, 'SLogP' => 3, 'TPSA' => 60); 721 722 @{$OptionsInfo{DefaultPropertyNames}} = qw(MolecularWeight HeavyAtoms MolecularVolume RotatableBonds HydrogenBondDonors HydrogenBondAcceptors SLogP TPSA); 723 724 for $SupportedProperty (@{$OptionsInfo{SupportedPropertyNames}}) { 725 $OptionsInfo{SupportedPropertyNamesMap}{lc($SupportedProperty)} = $SupportedProperty; 726 } 727 728 # Process specified properties.... 729 my($SpecifiedPropertyName, @SpecifiedPropertyNames, %SpecifiedPropertyNamesMap); 730 731 @SpecifiedPropertyNames = (); 732 %SpecifiedPropertyNamesMap = (); 733 734 @{$OptionsInfo{SpecifiedPropertyNames}} = (); 735 %{$OptionsInfo{SpecifiedPropertyNamesMap}} = (); 736 737 if ($Options{mode} =~ /^All$/i) { 738 @SpecifiedPropertyNames = @{$OptionsInfo{SupportedPropertyNames}}; 739 } 740 elsif ($Options{mode} =~ /^RuleOf5$/i) { 741 @SpecifiedPropertyNames = @{$OptionsInfo{RuleOf5PropertyNames}}; 742 } 743 elsif ($Options{mode} =~ /^RuleOf3$/i) { 744 @SpecifiedPropertyNames = @{$OptionsInfo{RuleOf3PropertyNames}}; 745 } 746 elsif (IsEmpty($Options{mode})) { 747 @SpecifiedPropertyNames = @{$OptionsInfo{DefaultPropertyNames}}; 748 } 749 else { 750 # Comma delimited lisr of specified property names... 
751 my($Mode, $PropertyName, @PropertyNames, @UnsupportedPropertyNames); 752 753 $Mode = $Options{mode}; 754 $Mode =~ s/ //g; 755 756 @PropertyNames = split ",", $Mode; 757 @UnsupportedPropertyNames = (); 758 759 for $PropertyName (@PropertyNames) { 760 if (exists($OptionsInfo{SupportedPropertyNamesMap}{lc($PropertyName)})) { 761 push @SpecifiedPropertyNames, $PropertyName; 762 } 763 else { 764 push @UnsupportedPropertyNames, $PropertyName; 765 } 766 } 767 if (@UnsupportedPropertyNames) { 768 if (@UnsupportedPropertyNames > 1) { 769 warn "Error: The physicochemical property names specified - ", JoinWords(\@UnsupportedPropertyNames, ", ", 0)," - for option \"-m --mode\" are not valid.\n"; 770 } 771 else { 772 warn "Error: The physicochemical property name specified, @UnsupportedPropertyNames , for option \"-m --mode\" is not valid.\n"; 773 } 774 die "Allowed values:", JoinWords(\@{$OptionsInfo{SupportedPropertyNames}}, ", ", 0), "\n"; 775 } 776 if (!@SpecifiedPropertyNames) { 777 die "Error: No valid physicochemical property names specified for option \"-m --mode\".\n"; 778 } 779 } 780 781 # Set up specified property names map... 782 PROPERTY: for $SpecifiedPropertyName (@SpecifiedPropertyNames) { 783 if (exists $SpecifiedPropertyNamesMap{lc($SpecifiedPropertyName)}) { 784 warn "Warning: The physicochemical property name, $SpecifiedPropertyName, is specified multiple times as value of option \"-m --mode\" .\n"; 785 next PROPERTY; 786 } 787 # Canonical specified property name... 788 $SpecifiedPropertyNamesMap{lc($SpecifiedPropertyName)} = $OptionsInfo{SupportedPropertyNamesMap}{lc($SpecifiedPropertyName)}; 789 } 790 791 # Make sure for calculation of RuleOf3Violations, all appropriate property names are specified... 792 if ($Options{ruleof3violations} =~ /^Yes$/i && $Options{mode} =~ /^RuleOf5$/i) { 793 die "Error: The value specified, $Options{ruleof3violations}, for \"--RuleOf3Violations\" option in \"RuleOf5\" \"-m --Mode\" is not valid. 
You must specify RuleOf3 value for \"-m --Mode\" to calculate RuleOf3 violations.\n"; 794 } 795 796 if ($Options{ruleof3violations} =~ /^Yes$/i) { 797 my($RuleOf3PropertyName, @MissingRuleOf3Names); 798 799 @MissingRuleOf3Names = (); 800 PROPERTY: for $RuleOf3PropertyName (@{$OptionsInfo{RuleOf3PropertyNames}}) { 801 if (exists $SpecifiedPropertyNamesMap{lc($RuleOf3PropertyName)}) { 802 next PROPERTY; 803 } 804 push @MissingRuleOf3Names, $RuleOf3PropertyName; 805 806 # Add property name to specified properties names list and map... 807 push @SpecifiedPropertyNames, $RuleOf3PropertyName; 808 $SpecifiedPropertyNamesMap{lc($RuleOf3PropertyName)} = $OptionsInfo{SupportedPropertyNamesMap}{lc($RuleOf3PropertyName)}; 809 } 810 if (@MissingRuleOf3Names) { 811 warn "Warning: The following physicochemical property names not specified in \"-m --Mode\" option are required for calculating RuleOf3Violations and have been added to the list of property names: @MissingRuleOf3Names\n"; 812 } 813 } 814 815 # Make sure for calculation of RuleOf5Violations, all appropriate property names are specified... 816 if ($Options{ruleof5violations} =~ /^Yes$/i && $Options{mode} =~ /^RuleOf3$/i) { 817 die "Error: The value specified, $Options{ruleof5violations}, for \"--RuleOf5Violations\" option in \"RuleOf3\" \"-m --Mode\" is not valid. You must specify RuleOf5 value for \"-m --Mode\" to calculate RuleOf5 violations.\n"; 818 } 819 820 if ($Options{ruleof5violations} =~ /^Yes$/i) { 821 my($RuleOf5PropertyName, @MissingRuleOf5Names); 822 823 @MissingRuleOf5Names = (); 824 PROPERTY: for $RuleOf5PropertyName (@{$OptionsInfo{RuleOf5PropertyNames}}) { 825 if (exists $SpecifiedPropertyNamesMap{lc($RuleOf5PropertyName)}) { 826 next PROPERTY; 827 } 828 push @MissingRuleOf5Names, $RuleOf5PropertyName; 829 830 # Add property name to specified properties names list and map... 
push @SpecifiedPropertyNames, $RuleOf5PropertyName;
      $SpecifiedPropertyNamesMap{lc($RuleOf5PropertyName)} = $OptionsInfo{SupportedPropertyNamesMap}{lc($RuleOf5PropertyName)};
    }
    if (@MissingRuleOf5Names) {
      warn "Warning: The following physicochemical property names not specified in \"-m --Mode\" option are required for calculating RuleOf5Violations and have been added to the list of property names: @MissingRuleOf5Names\n";
    }
  }
  $OptionsInfo{Mode} = $Options{mode};

  # Setup canonical specified property names corresponding to supported names in mixed case...
  my(@SpecifiedCanonicalPropertyNames);

  @SpecifiedCanonicalPropertyNames = ();
  for $SpecifiedPropertyName (@SpecifiedPropertyNames) {
    push @SpecifiedCanonicalPropertyNames, $SpecifiedPropertyNamesMap{lc($SpecifiedPropertyName)};
  }
  @{$OptionsInfo{SpecifiedPropertyNames}} = @SpecifiedCanonicalPropertyNames;
  %{$OptionsInfo{SpecifiedPropertyNamesMap}} = %SpecifiedPropertyNamesMap;

  # Based on specified property names, figure out whether hydrogens need to be added before
  # calculation of properties...
  #
  $OptionsInfo{AddHydrogens} = 0;
  if (exists($SpecifiedPropertyNamesMap{lc('MolecularVolume')}) || exists($SpecifiedPropertyNamesMap{lc('SLogP')}) || exists($SpecifiedPropertyNamesMap{lc('SMR')})) {
    $OptionsInfo{AddHydrogens} = 1;
  }
}

# Process precision option...
#
# Reads the comma delimited "Name,Value" pairs specified using "--Precision" and
# stores the resulting settings in $OptionsInfo{PrecisionParametersMap}. The name
# MolecularWeight sets WeightPrecision (default: 2) and ExactMass sets MassPrecision
# (default: 4); names are matched without regard to case. The raw option value is
# kept in $OptionsInfo{Precision}.
#
# Dies for an odd number of values, an unknown name, or a value rejected by
# IsPositiveInteger.
#
sub ProcessPrecisionOption {
  my(@NameValueList, %PrecisionByKey, %KeyByLowerCaseName);

  %{$OptionsInfo{PrecisionParametersMap}} = ();

  %PrecisionByKey = ('WeightPrecision' => 2, 'MassPrecision' => 4);
  %KeyByLowerCaseName = ('molecularweight' => 'WeightPrecision', 'exactmass' => 'MassPrecision');

  @NameValueList = ();
  if ($Options{precision}) {
    my($PrecisionSpec);

    # Spaces carry no meaning in the specification...
    ($PrecisionSpec = $Options{precision}) =~ s/ //g;

    @NameValueList = split ",", $PrecisionSpec;
    if (@NameValueList % 2) {
      die "Error: Invalid number of values specified using \"--Precision\" option: It must contain even number of values.\n";
    }
  }

  # Consume the specification one name and value pair at a time...
  while (@NameValueList) {
    my($Name, $Value) = splice(@NameValueList, 0, 2);

    exists $KeyByLowerCaseName{lc($Name)}
      or die "Error: The precision parameter name specified, $Name, for option \"--Precision\" is not valid.\n";

    IsPositiveInteger($Value)
      or die "Error: The parameter value specified, $Value, for parameter name, $Name in option \"--Precision\" is not valid. Allowed values: positive integer. \n";

    $PrecisionByKey{$KeyByLowerCaseName{lc($Name)}} = $Value;
  }

  $OptionsInfo{Precision} = $Options{precision};
  %{$OptionsInfo{PrecisionParametersMap}} = %PrecisionByKey;
}

# Process rotatable bonds option...
#
# Reads the comma delimited "Name,Yes|No" pairs specified using "--RotatableBonds"
# and stores the resulting 1/0 flags in $OptionsInfo{RotatableBondsParametersMap}.
# All supported flags default to 1; names are matched without regard to case. The
# raw option value is kept in $OptionsInfo{RotatableBonds}.
#
# Dies for an odd number of values, an unknown name, or a value other than Yes or No.
#
sub ProcessRotatableBondsOption {
  my(@NameValueList, %FlagByName, %NameByLowerCaseName);

  %{$OptionsInfo{RotatableBondsParametersMap}} = ();

  %FlagByName = ('IgnoreTerminalBonds' => 1, 'IgnoreBondsToTripleBonds' => 1, 'IgnoreAmideBonds' => 1, 'IgnoreThioamideBonds' => 1, 'IgnoreSulfonamideBonds' => 1);

  # Lower case lookup table used to retrieve canonical parameter names...
  %NameByLowerCaseName = map { (lc($_) => $_) } keys %FlagByName;

  @NameValueList = ();
  if ($Options{rotatablebonds}) {
    my($RotatableBondsSpec);

    # Spaces carry no meaning in the specification...
    ($RotatableBondsSpec = $Options{rotatablebonds}) =~ s/ //g;

    @NameValueList = split ",", $RotatableBondsSpec;
    if (@NameValueList % 2) {
      die "Error: Invalid number of values specified using \"--RotatableBonds\" option: It must contain even number of values.\n";
    }
  }

  # Consume the specification one name and value pair at a time...
  while (@NameValueList) {
    my($Name, $Value) = splice(@NameValueList, 0, 2);

    exists $NameByLowerCaseName{lc($Name)}
      or die "Error: The rotatable bonds parameter name specified, $Name, for option \"--RotatableBonds\" is not valid.\n";

    $Value =~ /^(Yes|No)$/i
      or die "Error: The parameter value specified, $Value, for parameter name, $Name in option \"--RotatableBonds\" is not valid. Allowed values: Yes or No. \n";

    $FlagByName{$NameByLowerCaseName{lc($Name)}} = ($Value =~ /^Yes$/i) ? 1 : 0;
  }

  $OptionsInfo{RotatableBonds} = $Options{rotatablebonds};
  %{$OptionsInfo{RotatableBondsParametersMap}} = %FlagByName;
}

# Process TPSA option...
#
# Reads the comma delimited "Name,Yes|No" pairs specified using "--TPSA" and stores
# the resulting 1/0 flags in $OptionsInfo{TPSAParametersMap}. IgnorePhosphorus and
# IgnoreSulfur both default to 1; names are matched without regard to case. The raw
# option value is kept in $OptionsInfo{TPSA}.
#
# Dies for an odd number of values, an unknown name, or a value other than Yes or No.
#
sub ProcessTPSAOption {
  my(@NameValueList, %FlagByName, %NameByLowerCaseName);

  %{$OptionsInfo{TPSAParametersMap}} = ();

  %FlagByName = ('IgnorePhosphorus' => 1, 'IgnoreSulfur' => 1);

  # Lower case lookup table used to retrieve canonical parameter names...
  %NameByLowerCaseName = map { (lc($_) => $_) } keys %FlagByName;

  @NameValueList = ();
  if ($Options{tpsa}) {
    my($TPSASpec);

    # Spaces carry no meaning in the specification...
    ($TPSASpec = $Options{tpsa}) =~ s/ //g;

    @NameValueList = split ",", $TPSASpec;
    if (@NameValueList % 2) {
      die "Error: Invalid number of values specified using \"--TPSA\" option: It must contain even number of values.\n";
    }
  }

  # Consume the specification one name and value pair at a time...
  while (@NameValueList) {
    my($Name, $Value) = splice(@NameValueList, 0, 2);

    exists $NameByLowerCaseName{lc($Name)}
      or die "Error: The TPSA parameter name specified, $Name, for option \"--TPSA\" is not valid.\n";

    $Value =~ /^(Yes|No)$/i
      or die "Error: The parameter value specified, $Value, for parameter name, $Name in option \"--TPSA\" is not valid. Allowed values: Yes or No. \n";

    $FlagByName{$NameByLowerCaseName{lc($Name)}} = ($Value =~ /^Yes$/i) ? 1 : 0;
  }

  $OptionsInfo{TPSA} = $Options{tpsa};
  %{$OptionsInfo{TPSAParametersMap}} = %FlagByName;
}

# Process molecular complexity parameters...
#
# Validates the comma delimited "Name,Value" pairs specified using "--MolecularComplexity",
# fills in defaults for everything left unspecified, and stores in
# $OptionsInfo{MolecularComplexityParametersMap} only those parameters which apply to the
# selected MolecularComplexityType (default: MACCSKeys). The raw option value is kept in
# $OptionsInfo{MolecularComplexity}.
#
# Notes:
#   . Pairs are split on commas; multiple values for AtomicInvariantsToUse and
#     FunctionalClassesToUse must therefore be delimited by spaces. Both end up stored
#     as array references.
#   . Yes/No values are converted to 1/0.
#   . Dies with an error message for any invalid name, value or combination of values.
#
sub ProcessMolecularComplexityOption {
  my($MolecularComplexityType, $ParameterName, $ParameterValue, @ParameterNames, @ParameterValues, @AtomIdentifierTypeParameters, %ComplexityParametersMap, %ComplexityParameterNamesMap);

  %{$OptionsInfo{MolecularComplexityParametersMap}} = ();

  # Empty values are placeholders: their defaults depend on the complexity type and are
  # assigned after the specified values have been processed...
  %ComplexityParametersMap = ('MolecularComplexityType' => '', 'AtomIdentifierType' => '',
                              'AtomicInvariantsToUse' => '', 'FunctionalClassesToUse' => '',
                              'MACCSKeysSize' => '166', 'NeighborhoodRadius' => '2',
                              'MinPathLength' => '1', 'MaxPathLength' => '8', 'UseBondSymbols' => '1',
                              'MinDistance' => '1', 'MaxDistance' => '10', 'UseTriangleInequality' => '',
                              'DistanceBinSize' => '2', 'NormalizationMethodology' => 'None');

  # Lower case lookup table used to retrieve canonical parameter names...
  %ComplexityParameterNamesMap = ();
  for $ParameterName (keys %ComplexityParametersMap) {
    $ComplexityParameterNamesMap{lc($ParameterName)} = $ParameterName;
  }

  if ($Options{molecularcomplexity}) {
    # Process specified values...
    my($Index, $SpecifiedComplexity, @SpecifiedComplexityValuePairs);

    $SpecifiedComplexity = $Options{molecularcomplexity};

    @SpecifiedComplexityValuePairs = split ",", $SpecifiedComplexity;
    if (@SpecifiedComplexityValuePairs % 2) {
      die "Error: Invalid number of values specified using \"--MolecularComplexity\" option: It must contain even number of values.\n";
    }

    for ($Index = 0; (($Index + 1) < @SpecifiedComplexityValuePairs); $Index += 2 ) {
      $ParameterName = $SpecifiedComplexityValuePairs[$Index];
      $ParameterValue = $SpecifiedComplexityValuePairs[$Index + 1];

      $ParameterName = RemoveLeadingAndTrailingWhiteSpaces($ParameterName);
      $ParameterValue = RemoveLeadingAndTrailingWhiteSpaces($ParameterValue);

      if (!exists $ComplexityParameterNamesMap{lc($ParameterName)}) {
        die "Error: The molecular complexity parameter name specified, $ParameterName, for option \"--MolecularComplexity\" is not valid.\n";
      }
      $ParameterName = $ComplexityParameterNamesMap{lc($ParameterName)};

      if ($ParameterName =~ /^AtomicInvariantsToUse$/i) {
        my($AtomSymbolFound);

        # Space delimited list of atomic invariants which must include atom symbol...
        $AtomSymbolFound = 0;
        @ParameterValues = split(' ', $ParameterValue);
        for $ParameterValue (@ParameterValues) {
          if (!AtomTypes::AtomicInvariantsAtomTypes::IsAtomicInvariantAvailable($ParameterValue)) {
            die "Error: The atomic invariant specified, $ParameterValue, for AtomicInvariantsToUse in option \"--MolecularComplexity\" is not valid.\n";
          }
          if ($ParameterValue =~ /^(AS|AtomSymbol)$/i) {
            $AtomSymbolFound = 1;
          }
        }
        if (!$AtomSymbolFound) {
          die "Error: The atomic invariants specified using AtomicInvariantsToUse in option \"--MolecularComplexity\" is not valid: AtomicInvariant atom symbol, AS or AtomSymbol, must be specified.\n";
        }
        $ParameterValue = JoinWords(\@ParameterValues, ",", 0);
      }
      elsif ($ParameterName =~ /^FunctionalClassesToUse$/i) {
        # Space delimited list of functional classes...
        @ParameterValues = split(' ', $ParameterValue);
        for $ParameterValue (@ParameterValues) {
          if (!AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($ParameterValue)) {
            die "Error: The functional class specified, $ParameterValue, for FunctionalClassesToUse in option \"--MolecularComplexity\" is not valid.\n";
          }
        }
        $ParameterValue = JoinWords(\@ParameterValues, ",", 0);
      }
      else {
        if ($ParameterValue =~ / /) {
          $ParameterValue =~ s/ //g;
        }
        if ($ParameterValue =~ /^(Yes|No)$/i) {
          $ParameterValue = ($ParameterValue =~ /^Yes$/i) ? 1 : 0;
        }
      }

      if ($ParameterName =~ /^MolecularComplexityType$/i) {
        if ($ParameterValue !~ /^(AtomTypesFingerprints|ExtendedConnectivityFingerprints|MACCSKeys|PathLengthFingerprints|TopologicalAtomPairsFingerprints|TopologicalAtomTripletsFingerprints|TopologicalAtomTorsionsFingerprints|TopologicalPharmacophoreAtomPairsFingerprints|TopologicalPharmacophoreAtomTripletsFingerprints)$/i) {
          die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--MolecularComplexity\" is not valid. Allowed values: AtomTypesFingerprints, ExtendedConnectivityFingerprints, MACCSKeys, PathLengthFingerprints, TopologicalAtomPairsFingerprints, TopologicalAtomTripletsFingerprints, TopologicalAtomTorsionsFingerprints, TopologicalPharmacophoreAtomPairsFingerprints, or TopologicalPharmacophoreAtomTripletsFingerprints..\n";
        }
      }
      elsif ($ParameterName =~ /^AtomIdentifierType$/i) {
        if ($ParameterValue !~ /^(AtomicInvariantsAtomTypes|FunctionalClassAtomTypes|DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
          die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--MolecularComplexity\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, FunctionalClassAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes and UFFAtomTypes.\n";
        }
      }
      elsif ($ParameterName =~ /^(MACCSKeysSize|MinPathLength|MaxPathLength|MinDistance|MaxDistance|DistanceBinSize)$/i) {
        if (!IsPositiveInteger($ParameterValue)) {
          die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--MolecularComplexity\" is not valid. Allowed values: positive integer. \n";
        }
      }
      elsif ($ParameterName =~ /^NeighborhoodRadius$/i) {
        if (!(IsInteger($ParameterValue) && $ParameterValue >=0)) {
          die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--MolecularComplexity\" is not valid. Allowed values: 0 or positive integer. \n";
        }
      }
      elsif ($ParameterName =~ /^NormalizationMethodology$/i) {
        if ($ParameterValue !~ /^(None|ByHeavyAtomsCount|ByPossibleKeysCount)$/i) {
          die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--MolecularComplexity\" is not valid. Allowed values: None, ByHeavyAtomsCount, or ByPossibleKeysCount\n";
        }
      }
      elsif ($ParameterName =~ /^(UseBondSymbols|UseTriangleInequality)$/i) {
        # Yes/No has already been converted to 1/0 above. Literal 0/1 and an empty value,
        # which falls back to the default, are still accepted; anything else is rejected
        # instead of being silently passed along...
        if ($ParameterValue !~ /^(0|1)?$/) {
          die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--MolecularComplexity\" is not valid. Allowed values: Yes or No. \n";
        }
      }
      $ComplexityParametersMap{$ParameterName} = $ParameterValue;
    }

    # Checks involving final values of multiple or individual parameters...
    if ($ComplexityParametersMap{MACCSKeysSize} !~ /^(166|322)$/i) {
      die "Error: The parameter value specified, $ComplexityParametersMap{MACCSKeysSize}, for parameter name, MACCSKeysSize in option \"--MolecularComplexity\" is not valid. Allowed values: 166 or 322\n";
    }
    if ($ComplexityParametersMap{MinPathLength} > $ComplexityParametersMap{MaxPathLength}) {
      die "Error: The parameter value specified for MinPathLength, $ComplexityParametersMap{MinPathLength}, must be <= MaxPathLength, $ComplexityParametersMap{MaxPathLength} ...\n";
    }
    if ($ComplexityParametersMap{MinDistance} > $ComplexityParametersMap{MaxDistance}) {
      die "Error: The parameter value specified for MinDistance, $ComplexityParametersMap{MinDistance}, must be <= MaxDistance, $ComplexityParametersMap{MaxDistance} ...\n";
    }
  }

  # Set default parameter values...

  if (IsEmpty($ComplexityParametersMap{MolecularComplexityType})) {
    $ComplexityParametersMap{MolecularComplexityType} = 'MACCSKeys';
  }
  $MolecularComplexityType = $ComplexityParametersMap{MolecularComplexityType};


  if (IsEmpty($ComplexityParametersMap{AtomIdentifierType})) {
    $ComplexityParametersMap{AtomIdentifierType} = ($MolecularComplexityType =~ /^(TopologicalPharmacophoreAtomPairsFingerprints|TopologicalPharmacophoreAtomTripletsFingerprints)$/i) ? "FunctionalClassAtomTypes" : "AtomicInvariantsAtomTypes";
  }

  if (IsEmpty($ComplexityParametersMap{AtomicInvariantsToUse})) {
    my($AtomicInvariantsToUse);

    if ($MolecularComplexityType =~ /^(AtomTypesFingerprints|TopologicalAtomPairsFingerprints|TopologicalAtomTripletsFingerprints|TopologicalAtomTorsionsFingerprints)$/i) {
      $AtomicInvariantsToUse = "AS,X,BO,H,FC";
    }
    elsif ($MolecularComplexityType =~ /^ExtendedConnectivityFingerprints$/i) {
      $AtomicInvariantsToUse = "AS,X,BO,H,FC,MN";
    }
    else {
      $AtomicInvariantsToUse = "AS";
    }
    $ComplexityParametersMap{AtomicInvariantsToUse} = $AtomicInvariantsToUse;
  }

  if (IsEmpty($ComplexityParametersMap{FunctionalClassesToUse})) {
    my($FunctionalClassesToUse);

    if ($MolecularComplexityType =~ /^TopologicalPharmacophoreAtomPairsFingerprints$/i) {
      $FunctionalClassesToUse = "HBD,HBA,PI,NI,H";
    }
    elsif ($MolecularComplexityType =~ /^TopologicalPharmacophoreAtomTripletsFingerprints$/i) {
      $FunctionalClassesToUse = "HBD,HBA,PI,NI,H,Ar";
    }
    else {
      $FunctionalClassesToUse = "HBD,HBA,PI,NI,H,Ar,Hal";
    }
    $ComplexityParametersMap{FunctionalClassesToUse} = $FunctionalClassesToUse;
  }

  # Replace comma delimited lists by references to arrays...
  my(@AtomicInvariantsToUse);
  @AtomicInvariantsToUse = split ',', $ComplexityParametersMap{AtomicInvariantsToUse};
  $ComplexityParametersMap{AtomicInvariantsToUse} = \@AtomicInvariantsToUse;

  my(@FunctionalClassesToUse);
  @FunctionalClassesToUse = split ',', $ComplexityParametersMap{FunctionalClassesToUse};
  $ComplexityParametersMap{FunctionalClassesToUse} = \@FunctionalClassesToUse;

  if (IsEmpty($ComplexityParametersMap{UseTriangleInequality})) {
    $ComplexityParametersMap{UseTriangleInequality} = 0;
    if ($MolecularComplexityType =~ /^TopologicalPharmacophoreAtomTripletsFingerprints$/i) {
      $ComplexityParametersMap{UseTriangleInequality} = 1;
    }
  }

  if ($MolecularComplexityType =~ /^(TopologicalPharmacophoreAtomPairsFingerprints|TopologicalPharmacophoreAtomTripletsFingerprints)$/i) {
    if ($ComplexityParametersMap{AtomIdentifierType} !~ /^FunctionalClassAtomTypes$/i) {
      die "Error: The parameter value specified for AtomIdentifierType, $ComplexityParametersMap{AtomIdentifierType}, in option \"--MolecularComplexity\" is not valid for MolecularComplexityType, $MolecularComplexityType: Allowed value: FunctionalClassAtomTypes...\n";
    }
  }

  # Set up appropriate parameter names for specified molecular complexity...

  @ParameterNames = ();
  push @ParameterNames, 'MolecularComplexityType';

  @AtomIdentifierTypeParameters = ();
  push @AtomIdentifierTypeParameters, 'AtomIdentifierType';
  if ($ComplexityParametersMap{AtomIdentifierType} =~ /^AtomicInvariantsAtomTypes$/i) {
    push @AtomIdentifierTypeParameters, 'AtomicInvariantsToUse';
  }
  elsif ($ComplexityParametersMap{AtomIdentifierType} =~ /^FunctionalClassAtomTypes$/i) {
    push @AtomIdentifierTypeParameters, 'FunctionalClassesToUse';
  }

  COMPLEXITYTYPE: {
    if ($MolecularComplexityType =~ /^AtomTypesFingerprints$/i) {
      push @ParameterNames, @AtomIdentifierTypeParameters;
      last COMPLEXITYTYPE;
    }
    if ($MolecularComplexityType =~ /^ExtendedConnectivityFingerprints$/i) {
      push @ParameterNames, @AtomIdentifierTypeParameters;
      push @ParameterNames, ('NeighborhoodRadius', 'NormalizationMethodology');
      last COMPLEXITYTYPE;
    }
    if ($MolecularComplexityType =~ /^MACCSKeys$/i) {
      push @ParameterNames, 'MACCSKeysSize';
      last COMPLEXITYTYPE;
    }
    if ($MolecularComplexityType =~ /^PathLengthFingerprints$/i) {
      push @ParameterNames, @AtomIdentifierTypeParameters;
      push @ParameterNames, ('MinPathLength', 'MaxPathLength', 'UseBondSymbols');
      last COMPLEXITYTYPE;
    }
    if ($MolecularComplexityType =~ /^TopologicalAtomPairsFingerprints$/i) {
      push @ParameterNames, @AtomIdentifierTypeParameters;
      push @ParameterNames, ('MinDistance', 'MaxDistance');
      last COMPLEXITYTYPE;
    }
    if ($MolecularComplexityType =~ /^TopologicalAtomTripletsFingerprints$/i) {
      push @ParameterNames, @AtomIdentifierTypeParameters;
      push @ParameterNames, ('MinDistance', 'MaxDistance', 'UseTriangleInequality');
      last COMPLEXITYTYPE;
    }
    if ($MolecularComplexityType =~ /^TopologicalAtomTorsionsFingerprints$/i) {
      push @ParameterNames, @AtomIdentifierTypeParameters;
      last COMPLEXITYTYPE;
    }
    if ($MolecularComplexityType =~ /^TopologicalPharmacophoreAtomPairsFingerprints$/i) {
      push @ParameterNames, ('AtomIdentifierType', 'FunctionalClassesToUse', 'MinDistance', 'MaxDistance', 'NormalizationMethodology');
      last COMPLEXITYTYPE;
    }
    if ($MolecularComplexityType =~ /^TopologicalPharmacophoreAtomTripletsFingerprints$/i) {
      push @ParameterNames, ('AtomIdentifierType', 'FunctionalClassesToUse', 'MinDistance', 'MaxDistance', 'UseTriangleInequality', 'NormalizationMethodology', 'DistanceBinSize');
      last COMPLEXITYTYPE;
    }
    # Report the complexity type itself: $ParameterValue only holds the value of the last
    # pair processed, or nothing at all when the option was not specified...
    die "Error: The parameter value specified, $MolecularComplexityType, for parameter name MolecularComplexityType using \"--MolecularComplexity\" is not valid.\n";
  }

  $OptionsInfo{MolecularComplexity} = $Options{molecularcomplexity};

  # Keep only the parameters relevant to the specified molecular complexity type...
  %{$OptionsInfo{MolecularComplexityParametersMap}} = ();
  for $ParameterName (@ParameterNames) {
    $ParameterValue = $ComplexityParametersMap{$ParameterName};
    $OptionsInfo{MolecularComplexityParametersMap}{$ParameterName} = $ParameterValue;
  }
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Initializes %Options with default values, parses the command line using GetOptions,
# changes to the working directory when "-w --workingdir" is specified, and validates
# the values of options restricted to a fixed set of choices. Dies with an error message
# for any invalid option or value.
#
# Options holding lists of name and value pairs (mode, molecularcomplexity, precision,
# rotatablebonds and tpsa) default to an empty string here; the Process*Option
# subroutines in this file assign their defaults.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  $Options{aromaticitymodel} = 'MayaChemToolsAromaticityModel';

  $Options{compoundidmode} = 'LabelPrefix';
  $Options{compoundidlabel} = 'CompoundID';
  $Options{datafieldsmode} = 'CompoundID';

  $Options{filter} = 'Yes';

  $Options{hydrogenbonds} = 'HBondsType2';

  $Options{keeplargestcomponent} = 'Yes';

  # Default mode values are set later...
  $Options{mode} = '';

  # Default molecular complexity values are set later...
  $Options{molecularcomplexity} = '';

  # Default precision values are set later...
  $Options{precision} = '';

  $Options{output} = 'text';
  $Options{outdelim} = 'comma';
  $Options{quote} = 'yes';

  # Default rotatable bond parameter values are set later...
  $Options{rotatablebonds} = '';

  $Options{ruleof3violations} = 'No';
  $Options{ruleof5violations} = 'No';

  # Default TPSA parameter values are set later...
  $Options{tpsa} = '';

  if (!GetOptions(\%Options, "aromaticitymodel=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "filter|f=s", "help|h", "hydrogenbonds=s", "keeplargestcomponent|k=s", "mode|m=s", "molecularcomplexity=s", "outdelim=s", "output=s", "overwrite|o", "precision=s", "rotatablebonds=s", "ruleof3violations=s", "ruleof5violations=s", "quote|q=s", "root|r=s", "tpsa=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if (!Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
    my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
    die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
  }
  if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
    die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
  }
  if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
    die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
  }
  if ($Options{filter} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{hydrogenbonds} !~ /^(HBondsType1|HydrogenBondsType1|HBondsType2|HydrogenBondsType2)$/i) {
    die "Error: The value specified, $Options{hydrogenbonds}, for option \"--HydrogenBonds\" is not valid. Allowed values: HBondsType1, HydrogenBondsType1, HBondsType2, HydrogenBondsType2\n";
  }
  if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{output} !~ /^(SD|text|both)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{ruleof3violations} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{ruleof3violations}, for option \"--RuleOf3Violations\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{ruleof5violations} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{ruleof5violations}, for option \"--RuleOf5Violations\" is not valid. Allowed values: Yes or No\n";
  }
}