#!/usr/bin/perl -w
#
# File: MACCSKeysFingerprints.pl
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2024 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;
use SDFileUtil;
use MoleculeFileIO;
use FileIO::FingerprintsSDFileIO;
use FileIO::FingerprintsTextFileIO;
use FileIO::FingerprintsFPFileIO;
use Fingerprints::MACCSKeys;

# File-level state shared by the subroutines in this script...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
#
# Usage is written out using die both for an explicit help request and for a
# missing input file specification...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Expand the input file specifications on the command line into SD file names...
my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Process input files..
my($FileIndex);
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for $FileIndex (0 .. $#SDFilesList) {
  # Only files which passed the checks in RetrieveSDFilesInfo are processed...
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    GenerateMACCSKeysFingerprints($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Generate fingerprints for a SD file...
#
# Arguments:
#   $FileIndex - Index of the SD file in @SDFilesList; the same index is used
#                to look up its output file names in %SDFilesInfo.
#
# Opens the output files selected by the output options, reads compounds from
# the SD file one at a time, optionally filters them, generates fingerprints
# and writes them out. Compounds which are filtered out, or for which no
# fingerprints object is returned, are counted as ignored. Summary statistics
# are printed after all compounds have been read.
#
sub GenerateMACCSKeysFingerprints {
  my($FileIndex) = @_;
  my($CmpdCount, $IgnoredCmpdCount, $SDFile, $MoleculeFileIO, $Molecule, $MACCSKeysFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);

  $SDFile = $SDFilesList[$FileIndex];

  # Setup output files...
  #
  ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = SetupAndOpenOutputFiles($FileIndex);

  $MoleculeFileIO = MoleculeFileIO->new('Name' => $SDFile);
  $MoleculeFileIO->Open();

  $CmpdCount = 0;
  $IgnoredCmpdCount = 0;

  COMPOUND: while ($Molecule = $MoleculeFileIO->ReadMolecule()) {
    $CmpdCount++;

    # Filter compound data before calculating fingerprints...
    if ($OptionsInfo{Filter}) {
      if (CheckAndFilterCompound($CmpdCount, $Molecule)) {
        $IgnoredCmpdCount++;
        next COMPOUND;
      }
    }

    $MACCSKeysFingerprints = GenerateMoleculeFingerprints($Molecule);
    if (!$MACCSKeysFingerprints) {
      $IgnoredCmpdCount++;
      ProcessIgnoredCompound('FingerprintsGenerationFailed', $CmpdCount, $Molecule);
      next COMPOUND;
    }

    WriteDataToOutputFiles($FileIndex, $CmpdCount, $Molecule, $MACCSKeysFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
  }
  $MoleculeFileIO->Close();

  # Close whichever output files were opened; unused ones are undef...
  for my $OutputFileIO ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) {
    if ($OutputFileIO) {
      $OutputFileIO->Close();
    }
  }

  WriteFingerprintsGenerationSummaryStatistics($CmpdCount, $IgnoredCmpdCount);
}

# Process compound being ignored due to problems in fingerprints generation...
#
# Arguments:
#   $Mode      - Reason for ignoring the compound: ContainsNonElementalData,
#                ContainsNoElementalData or FingerprintsGenerationFailed.
#   $CmpdCount - Record number of the compound in the SD file.
#   $Molecule  - Molecule object for the compound.
#
# Writes out a warning containing the record number and compound ID.
#
sub ProcessIgnoredCompound {
  my($Mode, $CmpdCount, $Molecule) = @_;
  my($CmpdID, $DataFieldLabelAndValuesRef);

  $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
  $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);

  if ($Mode =~ /^ContainsNonElementalData$/i) {
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
  }
  elsif ($Mode =~ /^ContainsNoElementalData$/i) {
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n";
  }
  else {
    # FingerprintsGenerationFailed and any unrecognized mode share the same message...
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
  }
}

# Check and filter compounds....
#
# Arguments:
#   $CmpdCount - Record number of the compound in the SD file.
#   $Molecule  - Molecule object for the compound.
#
# Returns 1, after writing out a warning, for a compound which contains any
# non-elements or no elements at all; otherwise, returns 0.
#
sub CheckAndFilterCompound {
  my($CmpdCount, $Molecule) = @_;
  my($ElementCount, $NonElementCount);

  ($ElementCount, $NonElementCount) = $Molecule->GetNumOfElementsAndNonElements();

  if ($NonElementCount) {
    ProcessIgnoredCompound('ContainsNonElementalData', $CmpdCount, $Molecule);
    return 1;
  }

  if (!$ElementCount) {
    ProcessIgnoredCompound('ContainsNoElementalData', $CmpdCount, $Molecule);
    return 1;
  }

  return 0;
}

# Write out compounds fingerprints generation summary statistics...
#
# Arguments:
#   $CmpdCount        - Total number of compounds read.
#   $IgnoredCmpdCount - Number of compounds ignored.
#
sub WriteFingerprintsGenerationSummaryStatistics {
  my($CmpdCount, $IgnoredCmpdCount) = @_;
  my($ProcessedCmpdCount);

  $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;

  print "\nNumber of compounds: $CmpdCount\n";
  print "Number of compounds processed successfully during fingerprints generation: $ProcessedCmpdCount\n";
  print "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
}

# Open output files...
#
# Arguments:
#   $FileIndex - Index used to look up output file names in %SDFilesInfo.
#
# Returns a list containing SD, text and FP fingerprints file IO objects, in
# that order. The entry for an output type which is not selected is undef.
#
sub SetupAndOpenOutputFiles {
  my($FileIndex) = @_;
  my($NewFPSDFile, $NewFPFile, $NewFPTextFile, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO, %FingerprintsFileIOParams);

  ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = (undef) x 3;

  # Setup common parameters for fingerprints file IO objects...
  #
  # Bits mode writes out a bit vector string and count mode writes out a
  # vector string...
  #
  %FingerprintsFileIOParams = ();
  if ($OptionsInfo{Mode} =~ /^MACCSKeyBits$/i) {
    %FingerprintsFileIOParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsBitVectorString', 'BitStringFormat' => $OptionsInfo{BitStringFormat}, 'BitsOrder' => $OptionsInfo{BitsOrder});
  }
  elsif ($OptionsInfo{Mode} =~ /^MACCSKeyCount$/i) {
    %FingerprintsFileIOParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsVectorString', 'VectorStringFormat' => $OptionsInfo{VectorStringFormat});
  }

  if ($OptionsInfo{SDOutput}) {
    $NewFPSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
    print "Generating SD file $NewFPSDFile...\n";
    $NewFPSDFileIO = FileIO::FingerprintsSDFileIO->new('Name' => $NewFPSDFile, %FingerprintsFileIOParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
    $NewFPSDFileIO->Open();
  }

  if ($OptionsInfo{FPOutput}) {
    $NewFPFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
    print "Generating FP file $NewFPFile...\n";
    $NewFPFileIO = FileIO::FingerprintsFPFileIO->new('Name' => $NewFPFile, %FingerprintsFileIOParams);
    $NewFPFileIO->Open();
  }

  if ($OptionsInfo{TextOutput}) {
    my($ColLabelsRef);

    $NewFPTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
    $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);

    print "Generating text file $NewFPTextFile...\n";
    $NewFPTextFileIO = FileIO::FingerprintsTextFileIO->new('Name' => $NewFPTextFile, %FingerprintsFileIOParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
    $NewFPTextFileIO->Open();
  }

  return ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
}

# Write fingerprints and other data to appropriate output files...
#
# Arguments:
#   $FileIndex             - Index of the SD file being processed.
#   $CmpdCount             - Record number of the compound in the SD file.
#   $Molecule              - Molecule object for the compound.
#   $MACCSKeysFingerprints - Fingerprints object generated for the molecule.
#   $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO
#                          - Output file IO objects; nothing is written for
#                            an output type whose object is false.
#
sub WriteDataToOutputFiles {
  my($FileIndex, $CmpdCount, $Molecule, $MACCSKeysFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;

  # Data fields are retrieved only for text and FP outputs, where they are
  # used to set up column values and compound ID...
  my $DataFieldsRef = ($NewFPTextFileIO || $NewFPFileIO) ? $Molecule->GetDataFieldLabelAndValues() : undef;

  # SD output: fingerprints are written along with input compound string...
  if ($NewFPSDFileIO) {
    my $InputCmpdString = $Molecule->GetInputMoleculeString();
    $NewFPSDFileIO->WriteFingerprints($MACCSKeysFingerprints, $InputCmpdString);
  }

  # Text output: fingerprints are written along with column values...
  if ($NewFPTextFileIO) {
    my $TextColValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $DataFieldsRef);
    $NewFPTextFileIO->WriteFingerprints($MACCSKeysFingerprints, $TextColValuesRef);
  }

  # FP output: fingerprints are written along with compound ID...
  if ($NewFPFileIO) {
    my $FPCmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldsRef);
    $NewFPFileIO->WriteFingerprints($MACCSKeysFingerprints, $FPCmpdID);
  }
}

# Generate appropriate column labels for FPText output file...
#
# Arguments:
#   $FileIndex - Index used to look up data field labels in %SDFilesInfo.
#
# Returns a reference to a list containing data column labels selected by
# data fields mode followed by the fingerprints label.
#
sub SetupFPTextFileCoulmnLabels {
  my($FileIndex) = @_;
  my $FieldsMode = $OptionsInfo{DataFieldsMode};

  # Only the branch matching data fields mode is evaluated...
  my @DataLabels =
      $FieldsMode =~ /^All$/i        ? @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]}
    : $FieldsMode =~ /^Common$/i     ? @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]}
    : $FieldsMode =~ /^Specify$/i    ? @{$OptionsInfo{SpecifiedDataFields}}
    : $FieldsMode =~ /^CompoundID$/i ? ($OptionsInfo{CompoundIDLabel})
    :                                  ();

  # Add fingerprints label...
  return [@DataLabels, $OptionsInfo{FingerprintsLabel}];
}

# Generate column values FPText output file..
#
# Arguments:
#   $FileIndex                  - Index used to look up data field labels in
#                                 %SDFilesInfo.
#   $CmpdCount                  - Record number of the compound in the SD file.
#   $Molecule                   - Molecule object for the compound.
#   $DataFieldLabelAndValuesRef - Reference to a hash containing data field
#                                 labels and values for the compound.
#
# Returns a reference to a list of column values. In CompoundID mode, the list
# contains only the compound ID; in other modes, it contains one value for each
# data field label with an empty string for a label missing in the compound.
#
sub SetupFPTextFileCoulmnValues {
  my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
  my($DataFieldsMode, $DataFieldLabelsRef, @ColValues);

  @ColValues = ();
  $DataFieldsMode = $OptionsInfo{DataFieldsMode};

  if ($DataFieldsMode =~ /^CompoundID$/i) {
    push @ColValues, SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
    return \@ColValues;
  }

  # Pick the list of data field labels corresponding to the mode...
  $DataFieldLabelsRef = undef;
  if ($DataFieldsMode =~ /^All$/i) {
    $DataFieldLabelsRef = $SDFilesInfo{AllDataFieldsRef}[$FileIndex];
  }
  elsif ($DataFieldsMode =~ /^Common$/i) {
    $DataFieldLabelsRef = $SDFilesInfo{CommonDataFieldsRef}[$FileIndex];
  }
  elsif ($DataFieldsMode =~ /^Specify$/i) {
    $DataFieldLabelsRef = $OptionsInfo{SpecifiedDataFields};
  }

  if (defined $DataFieldLabelsRef) {
    @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$DataFieldLabelsRef};
  }

  return \@ColValues;
}

# Generate compound ID for FP and FPText output files..
#
# Arguments:
#   $CmpdCount                  - Record number of the compound in the SD file.
#   $Molecule                   - Molecule object for the compound.
#   $DataFieldLabelAndValuesRef - Reference to a hash containing data field
#                                 labels and values for the compound.
#
# Returns compound ID generated according to compound ID mode:
#   MolNameOrLabelPrefix - Molecule name or, for a false name, the value of
#                          CompoundID option followed by record number.
#   LabelPrefix          - Value of CompoundID option followed by record number.
#   DataField            - Value of the data field named by CompoundID option
#                          or an empty string for a missing data field.
#   MolName              - Molecule name.
# An empty string is returned for any other mode.
#
sub SetupCmpdIDForOutputFiles {
  my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
  my($CmpdID);

  $CmpdID = '';
  if ($OptionsInfo{CompoundIDMode} =~ /^MolNameOrLabelPrefix$/i) {
    my($MolName);
    $MolName = $Molecule->GetName();
    $CmpdID = $MolName ? $MolName : "$OptionsInfo{CompoundID}${CmpdCount}";
  }
  elsif ($OptionsInfo{CompoundIDMode} =~ /^LabelPrefix$/i) {
    $CmpdID = "$OptionsInfo{CompoundID}${CmpdCount}";
  }
  elsif ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i) {
    my($SpecifiedDataField);
    $SpecifiedDataField = $OptionsInfo{CompoundID};
    $CmpdID = exists $DataFieldLabelAndValuesRef->{$SpecifiedDataField} ? $DataFieldLabelAndValuesRef->{$SpecifiedDataField} : '';
  }
  elsif ($OptionsInfo{CompoundIDMode} =~ /^MolName$/i) {
    $CmpdID = $Molecule->GetName();
  }
  return $CmpdID;
}

# Generate fingerprints for molecule...
#
# Arguments:
#   $Molecule - Molecule object for the compound.
#
# Returns fingerprints object after generating MACCS keys, or undef when
# rings detection returns a false value. Dies for an invalid mode.
#
# The largest component is kept in the molecule, when requested, before
# detecting rings and aromaticity.
#
sub GenerateMoleculeFingerprints {
  my($Molecule) = @_;
  my($MACCSKeysFingerprints, $FingerprintsType);

  if ($OptionsInfo{KeepLargestComponent}) {
    $Molecule->KeepLargestComponent();
  }
  if (!$Molecule->DetectRings()) {
    return undef;
  }
  $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
  $Molecule->DetectAromaticity();

  # Map mode, matched case insensitively, to fingerprints type...
  if ($OptionsInfo{Mode} =~ /^MACCSKeyBits$/i) {
    $FingerprintsType = 'MACCSKeyBits';
  }
  elsif ($OptionsInfo{Mode} =~ /^MACCSKeyCount$/i) {
    $FingerprintsType = 'MACCSKeyCount';
  }
  else {
    die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: MACCSKeyBits or MACCSKeyCount\n";
  }

  $MACCSKeysFingerprints = Fingerprints::MACCSKeys->new('Molecule' => $Molecule, 'Type' => $FingerprintsType, 'Size' => $OptionsInfo{Size});
  $MACCSKeysFingerprints->GenerateMACCSKeys();

  return $MACCSKeysFingerprints;
}

# Retrieve information about SD files...
#
# Sets up file-level %SDFilesInfo hash containing the following lists, with
# one entry for each file in @SDFilesList:
#   FileOkay            - 1 for a file which passed all the checks; otherwise, 0.
#   OutFileRoot         - Root of output file names.
#   SDOutFileNames, FPOutFileNames, TextOutFileNames
#                       - Names of output files.
#   AllDataFieldsRef, CommonDataFieldsRef
#                       - Data field labels collected from the SD file for All
#                         or Common data fields mode during text output;
#                         otherwise, empty strings.
#
# A file is ignored, with a warning, when it doesn't exist, is not a SD file,
# is missing the data field needed for compound ID, would be overwritten by its
# own SD output, or has an already existing output file without overwrite option.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFileRoot}} = ();
  @{$SDFilesInfo{SDOutFileNames}} = ();
  @{$SDFilesInfo{FPOutFileNames}} = ();
  @{$SDFilesInfo{TextOutFileNames}} = ();
  @{$SDFilesInfo{AllDataFieldsRef}} = ();
  @{$SDFilesInfo{CommonDataFieldsRef}} = ();

  $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
  $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    # Mark the file as not okay until all the checks have passed...
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFileRoot}[$Index] = '';
    $SDFilesInfo{SDOutFileNames}[$Index] = '';
    $SDFilesInfo{FPOutFileNames}[$Index] = '';
    $SDFilesInfo{TextOutFileNames}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    if ($CheckDataField) {
      # Make sure data field exists in SD file..
      my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);

      @CmpdLines = ();
      # Three-argument open: the file name is never interpreted as a mode or a command...
      open SDFILE, "<", $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      $CmpdString = ReadCmpdString(\*SDFILE);
      close SDFILE;
      @CmpdLines = split "\n", $CmpdString;
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      $SpecifiedDataField = $OptionsInfo{CompoundID};
      if (!exists $DataFieldValues{$SpecifiedDataField}) {
        warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
        next FILELIST;
      }
    }

    $AllDataFieldsRef = '';
    $CommonDataFieldsRef = '';
    if ($CollectDataFields) {
      my($CmpdCount);
      open SDFILE, "<", $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE);
      close SDFILE;
    }

    # Setup output file names...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    $TextOutFileExt = "csv";
    if ($Options{outdelim} =~ /^tab$/i) {
      $TextOutFileExt = "tsv";
    }
    $SDOutFileExt = $FileExt;
    $FPOutFileExt = "fpf";

    # A specified root is used only for a single input file...
    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = "${FileName}MACCSKeysFP";
    }

    $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
    $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
    $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

    if ($OptionsInfo{SDOutput}) {
      # Quote the output file name: it's matched literally instead of being
      # interpreted as a regular expression...
      if ($SDFile =~ /\Q$NewSDFileName\E/i) {
        warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
        print "Specify a different name using \"-r --root\" option or use default name.\n";
        next FILELIST;
      }
    }

    if (!$OptionsInfo{OverwriteFiles}) {
      # Check SD, FP and text output files...
      if ($OptionsInfo{SDOutput}) {
        if (-e $NewSDFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{FPOutput}) {
        if (-e $NewFPFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{TextOutput}) {
        if (-e $NewTextFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
          next FILELIST;
        }
      }
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;

    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
    $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
    $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

    $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
    $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
  }
}

# Process option values...
#
# Sets up file-level %OptionsInfo hash from the values in %Options: Yes/No
# values are converted into 1/0 flags, output type is expanded into SDOutput,
# FPOutput and TextOutput flags, and default values are assigned to compound
# ID prefix and fingerprints label.
#
# Dies when the value of an option required by the specified compound ID mode
# or data fields mode is missing.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};
  $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel};

  $OptionsInfo{BitsOrder} = $Options{bitsorder};
  $OptionsInfo{BitStringFormat} = $Options{bitstringformat};

  $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
  $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
  $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};

  $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;

  my(@SpecifiedDataFields);
  @SpecifiedDataFields = ();

  @{$OptionsInfo{SpecifiedDataFields}} = ();
  $OptionsInfo{CompoundID} = '';

  # Compound ID value is set up only for CompoundID data fields mode...
  if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
    if ($Options{compoundidmode} =~ /^DataField$/i) {
      if (!$Options{compoundid}) {
        die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
      }
      $OptionsInfo{CompoundID} = $Options{compoundid};
    }
    elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
      $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
    }
  }
  elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
    if (!$Options{datafields}) {
      die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
    }
    # Data field labels are specified as a comma delimited list...
    @SpecifiedDataFields = split /,/, $Options{datafields};
    push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields;
  }

  $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'MACCSKeysFingerprints';

  $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{Output} = $Options{output};
  $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
  $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
  $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;

  $OptionsInfo{OutDelim} = $Options{outdelim};
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;

  $OptionsInfo{Size} = $Options{size};

  $OptionsInfo{VectorStringFormat} = $Options{vectorstringformat};
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Sets up default values in file-level %Options hash, retrieves values of
# options specified on the command line, changes to the working directory
# when one is specified, and validates option values. Dies for an invalid
# option or option value.
#
sub SetupScriptUsage {
  my(@OptionsSpecification);

  # Setup default values for options...
  %Options = (
    aromaticitymodel     => 'MayaChemToolsAromaticityModel',
    bitsorder            => 'Ascending',
    bitstringformat      => 'BinaryString',
    compoundidmode       => 'LabelPrefix',
    compoundidlabel      => 'CompoundID',
    datafieldsmode       => 'CompoundID',
    filter               => 'Yes',
    keeplargestcomponent => 'Yes',
    mode                 => 'MACCSKeyBits',
    output               => 'text',
    outdelim             => 'comma',
    quote                => 'yes',
    size                 => 166,
    vectorstringformat   => 'ValuesString',
  );

  # Retrieve all the options...
  @OptionsSpecification = ("aromaticitymodel=s", "bitsorder=s", "bitstringformat|b=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "filter|f=s", "fingerprintslabel=s", "help|h", "keeplargestcomponent|k=s", "mode|m=s", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s", "size|s=i", "vectorstringformat|v=s", "workingdir|w=s");

  GetOptions(\%Options, @OptionsSpecification)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  # Change to working directory before validating the rest of the options...
  if ($Options{workingdir}) {
    -d $Options{workingdir}
      or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  unless (Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
    my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
    die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
  }

  # Validate option values against allowed values; matching is case insensitive...
  $Options{bitsorder} =~ /^(Ascending|Descending)$/i
    or die "Error: The value specified, $Options{bitsorder}, for option \"--BitsOrder\" is not valid. Allowed values: Ascending or Descending\n";

  $Options{bitstringformat} =~ /^(BinaryString|HexadecimalString)$/i
    or die "Error: The value specified, $Options{bitstringformat}, for option \"-b, --bitstringformat\" is not valid. Allowed values: BinaryString or HexadecimalString\n";

  $Options{compoundidmode} =~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i
    or die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";

  $Options{datafieldsmode} =~ /^(All|Common|Specify|CompoundID)$/i
    or die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";

  $Options{filter} =~ /^(Yes|No)$/i
    or die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";

  $Options{keeplargestcomponent} =~ /^(Yes|No)$/i
    or die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";

  $Options{mode} =~ /^(MACCSKeyBits|MACCSKeyCount)$/i
    or die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: MACCSKeyBits or MACCSKeyCount\n";

  $Options{output} =~ /^(SD|FP|text|all)$/i
    or die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";

  $Options{outdelim} =~ /^(comma|semicolon|tab)$/i
    or die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";

  $Options{quote} =~ /^(Yes|No)$/i
    or die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";

  # Unquoted output is not allowed with semicolon delimiter...
  die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n"
    if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i);

  unless (IsPositiveInteger($Options{size}) && ($Options{size} == 166 || $Options{size} == 322)) {
    die "Error: The value specified, $Options{size}, for option \"-s, --size\" is not valid. Allowed values: 166 or 322 \n";
  }

  $Options{vectorstringformat} =~ /^(ValuesString|IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i
    or die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: ValuesString, IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
}