#!/usr/bin/perl -w
#
# File: EStateIndiciesFingerprints.pl
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2024 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;
use SDFileUtil;
use MoleculeFileIO;
use FileIO::FingerprintsSDFileIO;
use FileIO::FingerprintsTextFileIO;
use FileIO::FingerprintsFPFileIO;
use AtomTypes::EStateAtomTypes;
use Fingerprints::EStateIndiciesFingerprints;

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Process input files; files flagged as not okay by RetrieveSDFilesInfo are skipped...
my($FileIndex);
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for $FileIndex (0 .. $#SDFilesList) {
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    GenerateEStateIndiciesFingerprints($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Generate fingerprints for a SD file...
#
# Parameters:
#   $FileIndex - Index of the input file in @SDFilesList and the %SDFilesInfo arrays.
#
# Reads molecules one at a time, optionally filters them, generates fingerprints
# and writes them to the output files opened by SetupAndOpenOutputFiles. Compounds
# which are filtered out or for which fingerprints generation fails are counted
# as ignored. A summary is printed at the end.
#
sub GenerateEStateIndiciesFingerprints {
  my($FileIndex) = @_;
  my($CmpdCount, $IgnoredCmpdCount, $SDFile, $MoleculeFileIO, $Molecule, $EStateIndiciesFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);

  $SDFile = $SDFilesList[$FileIndex];

  # Setup output files...
  #
  ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = SetupAndOpenOutputFiles($FileIndex);

  $MoleculeFileIO = MoleculeFileIO->new('Name' => $SDFile);
  $MoleculeFileIO->Open();

  $CmpdCount = 0;
  $IgnoredCmpdCount = 0;

  COMPOUND: while ($Molecule = $MoleculeFileIO->ReadMolecule()) {
    $CmpdCount++;

    # Filter compound data before calculating fingerprints...
    if ($OptionsInfo{Filter}) {
      if (CheckAndFilterCompound($CmpdCount, $Molecule)) {
        $IgnoredCmpdCount++;
        next COMPOUND;
      }
    }

    $EStateIndiciesFingerprints = GenerateMoleculeFingerprints($Molecule);
    if (!$EStateIndiciesFingerprints) {
      $IgnoredCmpdCount++;
      ProcessIgnoredCompound('FingerprintsGenerationFailed', $CmpdCount, $Molecule);
      next COMPOUND;
    }

    WriteDataToOutputFiles($FileIndex, $CmpdCount, $Molecule, $EStateIndiciesFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
  }
  $MoleculeFileIO->Close();

  # Only the writers for the requested output formats are defined...
  if ($NewFPSDFileIO) {
    $NewFPSDFileIO->Close();
  }
  if ($NewFPTextFileIO) {
    $NewFPTextFileIO->Close();
  }
  if ($NewFPFileIO) {
    $NewFPFileIO->Close();
  }

  WriteFingerprintsGenerationSummaryStatistics($CmpdCount, $IgnoredCmpdCount);
}

# Process compound being ignored due to problems in fingerprints generation...
#
# Parameters:
#   $Mode      - Reason for ignoring the compound: ContainsNonElementalData,
#                ContainsNoElementalData or FingerprintsGenerationFailed. Any other
#                value is reported using the FingerprintsGenerationFailed message.
#   $CmpdCount - Record number of the compound in the input file.
#   $Molecule  - Molecule object for the compound.
#
# Emits a warning identifying the compound by record number and compound ID.
#
sub ProcessIgnoredCompound {
  my($Mode, $CmpdCount, $Molecule) = @_;
  my($CmpdID, $DataFieldLabelAndValuesRef, $Reason);

  $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
  $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);

  if ($Mode =~ /^ContainsNonElementalData$/i) {
    $Reason = "Compound contains atom data corresponding to non-elemental atom symbol(s)...";
  }
  elsif ($Mode =~ /^ContainsNoElementalData$/i) {
    $Reason = "Compound contains no atom data...";
  }
  else {
    # FingerprintsGenerationFailed and unrecognized modes share the same message...
    $Reason = "Fingerprints generation didn't succeed...";
  }

  warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: $Reason\n\n";
}

# Check and filter compounds....
#
# Parameters:
#   $CmpdCount - Record number of the compound in the input file.
#   $Molecule  - Molecule object for the compound.
#
# Returns 1 after issuing a warning when the compound contains any non-elemental
# atom data or no elemental atom data at all; otherwise, returns 0.
#
sub CheckAndFilterCompound {
  my($CmpdCount, $Molecule) = @_;
  my($ElementCount, $NonElementCount);

  ($ElementCount, $NonElementCount) = $Molecule->GetNumOfElementsAndNonElements();

  if ($NonElementCount) {
    ProcessIgnoredCompound('ContainsNonElementalData', $CmpdCount, $Molecule);
    return 1;
  }

  if (!$ElementCount) {
    ProcessIgnoredCompound('ContainsNoElementalData', $CmpdCount, $Molecule);
    return 1;
  }

  return 0;
}

# Write out compounds fingerprints generation summary statistics...
#
# Parameters:
#   $CmpdCount        - Total number of compounds read.
#   $IgnoredCmpdCount - Number of compounds ignored during fingerprints generation.
#
sub WriteFingerprintsGenerationSummaryStatistics {
  my($CmpdCount, $IgnoredCmpdCount) = @_;
  my($ProcessedCmpdCount);

  $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;

  print "\nNumber of compounds: $CmpdCount\n";
  print "Number of compounds processed successfully during fingerprints generation: $ProcessedCmpdCount\n";
  print "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
}

# Open output files...
#
# Parameters:
#   $FileIndex - Index of the input file in the %SDFilesInfo arrays.
#
# Returns a list ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) of opened
# fingerprints writers. The entry for an output format which has not been
# requested is undef.
#
sub SetupAndOpenOutputFiles {
  my($FileIndex) = @_;
  my($NewFPSDFile, $NewFPFile, $NewFPTextFile, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO, %FingerprintsFileIOParams);

  ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = (undef) x 3;

  # Setup common parameters for fingerprints file IO objects...
  #
  %FingerprintsFileIOParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsVectorString', 'VectorStringFormat' => $OptionsInfo{VectorStringFormat});

  if ($OptionsInfo{SDOutput}) {
    $NewFPSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
    print "Generating SD file $NewFPSDFile...\n";
    $NewFPSDFileIO = FileIO::FingerprintsSDFileIO->new('Name' => $NewFPSDFile, %FingerprintsFileIOParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
    $NewFPSDFileIO->Open();
  }

  if ($OptionsInfo{FPOutput}) {
    $NewFPFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
    print "Generating FP file $NewFPFile...\n";
    $NewFPFileIO = FileIO::FingerprintsFPFileIO->new('Name' => $NewFPFile, %FingerprintsFileIOParams);
    $NewFPFileIO->Open();
  }

  if ($OptionsInfo{TextOutput}) {
    my($ColLabelsRef);

    $NewFPTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
    $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);

    print "Generating text file $NewFPTextFile...\n";
    $NewFPTextFileIO = FileIO::FingerprintsTextFileIO->new('Name' => $NewFPTextFile, %FingerprintsFileIOParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
    $NewFPTextFileIO->Open();
  }

  return ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
}

# Write fingerprints and other data to appropriate output files...
#
# Parameters:
#   $FileIndex, $CmpdCount, $Molecule - Input file index, compound record number
#       and molecule object for the current compound.
#   $EStateIndiciesFingerprints - Fingerprints object to write out.
#   $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO - Fingerprints writers; an
#       undefined writer is skipped.
#
sub WriteDataToOutputFiles {
  my($FileIndex, $CmpdCount, $Molecule, $EStateIndiciesFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;
  my($FieldsRef);

  # Data fields are retrieved only for the text and FP writers...
  $FieldsRef = undef;
  if ($NewFPTextFileIO || $NewFPFileIO) {
    $FieldsRef = $Molecule->GetDataFieldLabelAndValues();
  }

  # SD output: fingerprints along with the input molecule string...
  if ($NewFPSDFileIO) {
    my $MoleculeString = $Molecule->GetInputMoleculeString();
    $NewFPSDFileIO->WriteFingerprints($EStateIndiciesFingerprints, $MoleculeString);
  }

  # Text output: fingerprints along with data column values...
  if ($NewFPTextFileIO) {
    my $ValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $FieldsRef);
    $NewFPTextFileIO->WriteFingerprints($EStateIndiciesFingerprints, $ValuesRef);
  }

  # FP output: fingerprints along with compound ID...
  if ($NewFPFileIO) {
    my $ID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $FieldsRef);
    $NewFPFileIO->WriteFingerprints($EStateIndiciesFingerprints, $ID);
  }

}

# Generate appropriate column labels for FPText output file...
#
# Parameters:
#   $FileIndex - Index of the input file in the %SDFilesInfo arrays.
#
# Returns a reference to a list containing data column labels selected by
# DataFieldsMode option followed by the fingerprints label.
#
sub SetupFPTextFileCoulmnLabels {
  my($FileIndex) = @_;
  my($Mode, @DataLabels);

  $Mode = $OptionsInfo{DataFieldsMode};

  @DataLabels =
      ($Mode =~ /^All$/i)        ? @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]}
    : ($Mode =~ /^Common$/i)     ? @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]}
    : ($Mode =~ /^Specify$/i)    ? @{$OptionsInfo{SpecifiedDataFields}}
    : ($Mode =~ /^CompoundID$/i) ? ($OptionsInfo{CompoundIDLabel})
    :                              ();

  # Fingerprints label is always the last column...
  return [@DataLabels, $OptionsInfo{FingerprintsLabel}];
}

# Generate column values FPText output file..
#
# Parameters:
#   $FileIndex - Index of the input file in the %SDFilesInfo arrays.
#   $CmpdCount - Record number of the compound in the input file.
#   $Molecule  - Molecule object for the compound.
#   $DataFieldLabelAndValuesRef - Reference to a hash of data field labels and values.
#
# Returns a reference to a list of column values selected by DataFieldsMode
# option. A data field missing from the compound is written as an empty string.
#
sub SetupFPTextFileCoulmnValues {
  my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
  my(@ColValues, $GetValuesForLabels);

  # Map data field labels to their values using empty strings for missing fields...
  $GetValuesForLabels = sub {
    my($LabelsRef) = @_;
    return map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$LabelsRef};
  };

  @ColValues = ();
  if ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
    push @ColValues, SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
  }
  elsif ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
    @ColValues = $GetValuesForLabels->($SDFilesInfo{AllDataFieldsRef}[$FileIndex]);
  }
  elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
    @ColValues = $GetValuesForLabels->($SDFilesInfo{CommonDataFieldsRef}[$FileIndex]);
  }
  elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
    @ColValues = $GetValuesForLabels->($OptionsInfo{SpecifiedDataFields});
  }

  return \@ColValues;
}

# Generate compound ID for FP and FPText output files..
#
# Parameters:
#   $CmpdCount - Record number of the compound in the input file.
#   $Molecule  - Molecule object for the compound.
#   $DataFieldLabelAndValuesRef - Reference to a hash of data field labels and values.
#
# Returns compound ID generated according to CompoundIDMode option: molecule
# name, label prefix followed by record number, or value of a data field. An
# empty string is returned for a missing data field.
#
sub SetupCmpdIDForOutputFiles {
  my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
  my($CmpdID);

  $CmpdID = '';
  if ($OptionsInfo{CompoundIDMode} =~ /^MolNameOrLabelPrefix$/i) {
    my($MolName);
    $MolName = $Molecule->GetName();
    # Fall back to label prefix and record number for unnamed molecules...
    $CmpdID = $MolName ? $MolName : "$OptionsInfo{CompoundID}${CmpdCount}";
  }
  elsif ($OptionsInfo{CompoundIDMode} =~ /^LabelPrefix$/i) {
    $CmpdID = "$OptionsInfo{CompoundID}${CmpdCount}";
  }
  elsif ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i) {
    my($SpecifiedDataField);
    $SpecifiedDataField = $OptionsInfo{CompoundID};
    $CmpdID = exists $DataFieldLabelAndValuesRef->{$SpecifiedDataField} ? $DataFieldLabelAndValuesRef->{$SpecifiedDataField} : '';
  }
  elsif ($OptionsInfo{CompoundIDMode} =~ /^MolName$/i) {
    $CmpdID = $Molecule->GetName();
  }
  return $CmpdID;
}

# Generate fingerprints for molecule...
#
# Parameters:
#   $Molecule - Molecule object; it is modified by keeping its largest component
#               (when requested) and by ring and aromaticity detection.
#
# Returns fingerprints object on success. Nothing (undef in scalar context) is
# returned when ring detection or fingerprints generation fails.
#
sub GenerateMoleculeFingerprints {
  my($Molecule) = @_;
  my($EStateIndiciesFingerprints);

  if ($OptionsInfo{KeepLargestComponent}) {
    $Molecule->KeepLargestComponent();
  }
  if (!$Molecule->DetectRings()) {
    return;
  }
  $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
  $Molecule->DetectAromaticity();

  $EStateIndiciesFingerprints = Fingerprints::EStateIndiciesFingerprints->new('Molecule' => $Molecule, 'EStateAtomTypesSetToUse' => $OptionsInfo{EStateAtomTypesSetToUse}, 'ValuesPrecision' => $OptionsInfo{ValuesPrecision});

  # Generate E-state indicies fingerprints...
  $EStateIndiciesFingerprints->GenerateFingerprints();

  # Make sure E-state indicies fingerprints generation is successful...
  if (!$EStateIndiciesFingerprints->IsFingerprintsGenerationSuccessful()) {
    return;
  }

  return $EStateIndiciesFingerprints;
}

# Check whether the input file name refers to the same name as the output file...
#
# Parameters:
#   $InFileName  - Input file name, optionally containing a directory path.
#   $OutFileName - Output file name.
#
# Returns 1 when input file name, or its trailing path component(s), is equal
# to output file name ignoring case; otherwise, returns 0. The output file name
# is quoted to match literally: characters such as '.' are not treated as regex
# metacharacters and a partial name match is not reported as a clash.
#
sub _IsInputSameAsOutputFile {
  my($InFileName, $OutFileName) = @_;

  return ($InFileName =~ m{(?:\A|[/\\])\Q$OutFileName\E\z}i) ? 1 : 0;
}

# Retrieve information about SD files...
#
# Populates %SDFilesInfo with parallel arrays indexed by position in @SDFilesList:
# FileOkay flag, output file root, SD/FP/text output file names and, when
# collected, references to all and common data field labels. A file is flagged
# as not okay, after a warning, when it doesn't exist, is not a SD file, doesn't
# contain the data field needed for compound IDs, has the same name as its SD
# output file, or when an output file already exists and overwriting has not
# been requested.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFileRoot}} = ();
  @{$SDFilesInfo{SDOutFileNames}} = ();
  @{$SDFilesInfo{FPOutFileNames}} = ();
  @{$SDFilesInfo{TextOutFileNames}} = ();
  @{$SDFilesInfo{AllDataFieldsRef}} = ();
  @{$SDFilesInfo{CommonDataFieldsRef}} = ();

  $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
  $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFileRoot}[$Index] = '';
    $SDFilesInfo{SDOutFileNames}[$Index] = '';
    $SDFilesInfo{FPOutFileNames}[$Index] = '';
    $SDFilesInfo{TextOutFileNames}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    if ($CheckDataField) {
      # Make sure data field exists in SD file; only the first compound is checked...
      my($CmpdString, $SpecifiedDataField, $SDFileHandle, @CmpdLines, %DataFieldValues);

      @CmpdLines = ();
      open $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      $CmpdString = ReadCmpdString($SDFileHandle);
      close $SDFileHandle;
      @CmpdLines = split "\n", $CmpdString;
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      $SpecifiedDataField = $OptionsInfo{CompoundID};
      if (!exists $DataFieldValues{$SpecifiedDataField}) {
        warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
        next FILELIST;
      }
    }

    $AllDataFieldsRef = '';
    $CommonDataFieldsRef = '';
    if ($CollectDataFields) {
      my($CmpdCount, $SDFileHandle);
      open $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileHandle);
      close $SDFileHandle;
    }

    # Setup output file names...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    $TextOutFileExt = "csv";
    if ($OptionsInfo{OutDelim} =~ /^tab$/i) {
      $TextOutFileExt = "tsv";
    }
    $SDOutFileExt = $FileExt;
    $FPOutFileExt = "fpf";

    # Specified root is used only for a single input file...
    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = "${FileName}EStateIndiciesFP";
    }

    $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
    $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
    $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

    if ($OptionsInfo{SDOutput}) {
      if (_IsInputSameAsOutputFile($SDFile, $NewSDFileName)) {
        warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
        print "Specify a different name using \"-r --root\" option or use default name.\n";
        next FILELIST;
      }
    }

    if (!$OptionsInfo{OverwriteFiles}) {
      # Check SD, FP and text output files...
      if ($OptionsInfo{SDOutput}) {
        if (-e $NewSDFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{FPOutput}) {
        if (-e $NewFPFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{TextOutput}) {
        if (-e $NewTextFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
          next FILELIST;
        }
      }
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;

    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
    $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
    $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

    $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
    $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
  }
}

# Process option values...
#
# Converts validated command line values in %Options into %OptionsInfo used by
# the rest of the script. Dies when a value required by the selected modes,
# "--CompoundID" or "--DataFields", has not been specified.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel};

  $OptionsInfo{EStateAtomTypesSetToUse} = $Options{estateatomtypessettouse} ? $Options{estateatomtypessettouse} : 'ArbitrarySize';

  $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
  $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
  $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};

  my(@SpecifiedDataFields);
  @SpecifiedDataFields = ();

  @{$OptionsInfo{SpecifiedDataFields}} = ();
  $OptionsInfo{CompoundID} = '';

  # NOTE(review): CompoundID is set only in CompoundID data fields mode; it stays
  # empty in other modes although compound IDs are also generated for FP output
  # and warning messages - confirm whether that is intended.
  if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
    if ($Options{compoundidmode} =~ /^DataField$/i) {
      if (!$Options{compoundid}) {
        die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
      }
      $OptionsInfo{CompoundID} = $Options{compoundid};
    }
    elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
      $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
    }
  }
  elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
    if (!$Options{datafields}) {
      die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
    }
    @SpecifiedDataFields = split /\,/, $Options{datafields};
    push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields;
  }

  $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'EStateIndiciesFingerprints';

  $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;

  # Match ignoring case to agree with validation of the option value...
  if ($Options{fingerprintslabelmode} =~ /^FingerprintsLabelWithIDs$/i) {
    if ($Options{estateatomtypessettouse} =~ /^FixedSize$/i) {
      # Append E-state atom types for non-hydrogen atoms to the fingerprints label...
      my($AtomType, @IDs);
      @IDs = ();
      for $AtomType (@{AtomTypes::EStateAtomTypes::GetAllPossibleEStateNonHydrogenAtomTypes()}) {
        push @IDs, "S${AtomType}";
      }
      $OptionsInfo{FingerprintsLabel} .= "; EStateAtomTypes: " . TextUtil::JoinWords(\@IDs, " ", 0);
    }
  }
  $OptionsInfo{FingerprintsLabelMode} = $Options{fingerprintslabelmode};

  $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{Output} = $Options{output};
  $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
  $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
  $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;

  $OptionsInfo{OutDelim} = $Options{outdelim};
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;

  # Precision for E-state indicies...
  $OptionsInfo{ValuesPrecision} = $Options{valuesprecision};

  # Setup default vector string format: values only for fixed size atom types
  # set; IDs and values otherwise...
  my($VectorStringFormat);
  $VectorStringFormat = '';
  if ($Options{vectorstringformat}) {
    $VectorStringFormat = $Options{vectorstringformat};
  }
  else {
    $VectorStringFormat = ($Options{estateatomtypessettouse} =~ /^FixedSize$/i) ? "ValuesString" : "IDsAndValuesString";
  }
  $OptionsInfo{VectorStringFormat} = $VectorStringFormat;
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Initializes %Options with default values, parses command line using GetOptions,
# changes to the working directory when specified and dies with an error message
# for any invalid option value.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  $Options{aromaticitymodel} = 'MayaChemToolsAromaticityModel';

  $Options{compoundidmode} = 'LabelPrefix';
  $Options{compoundidlabel} = 'CompoundID';
  $Options{datafieldsmode} = 'CompoundID';

  $Options{filter} = 'Yes';

  $Options{estateatomtypessettouse} = 'ArbitrarySize';

  $Options{fingerprintslabelmode} = 'FingerprintsLabelOnly';
  $Options{keeplargestcomponent} = 'Yes';

  $Options{output} = 'text';
  $Options{outdelim} = 'comma';
  $Options{quote} = 'yes';

  $Options{valuesprecision} = 3;

  $Options{vectorstringformat} = '';

  if (!GetOptions(\%Options, "aromaticitymodel=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "estateatomtypessettouse|e=s", "filter|f=s", "fingerprintslabelmode=s", "fingerprintslabel=s", "help|h", "keeplargestcomponent|k=s", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s", "valuesprecision=s", "vectorstringformat|v=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if (!Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
    my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
    die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
  }
  if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
    die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
  }
  if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
    die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
  }
  # Unlike other options, this value is matched using exact case...
  if ($Options{estateatomtypessettouse} && $Options{estateatomtypessettouse} !~ /^(ArbitrarySize|FixedSize)$/) {
    die "Error: The value specified, $Options{estateatomtypessettouse}, for option \"-e, --EStateAtomTypesSetToUse\" is not valid. Allowed values: ArbitrarySize or FixedSize\n";
  }
  if ($Options{filter} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{fingerprintslabelmode} !~ /^(FingerprintsLabelOnly|FingerprintsLabelWithIDs)$/i) {
    die "Error: The value specified, $Options{fingerprintslabelmode}, for option \"--FingerprintsLabelMode\" is not valid. Allowed values: FingerprintsLabelOnly or FingerprintsLabelWithIDs\n";
  }
  if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{output} !~ /^(SD|FP|text|all)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
  }
  if (!IsPositiveInteger($Options{valuesprecision})) {
    die "Error: The value specified, $Options{valuesprecision}, for option \"--ValuesPrecision\" is not valid. Allowed values: > 0 \n";
  }
  if ($Options{vectorstringformat} && $Options{vectorstringformat} !~ /^(ValuesString|IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
    die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: ValuesString, IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
  }
}
