MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # File: MACCSKeysFingerprints.pl
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2019 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
  16 # any warranty; without even the implied warranty of merchantability of fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use FindBin; use lib "$FindBin::Bin/../lib";
  28 use Getopt::Long;
  29 use File::Basename;
  30 use Text::ParseWords;
  31 use Benchmark;
  32 use FileUtil;
  33 use TextUtil;
  34 use SDFileUtil;
  35 use MoleculeFileIO;
  36 use FileIO::FingerprintsSDFileIO;
  37 use FileIO::FingerprintsTextFileIO;
  38 use FileIO::FingerprintsFPFileIO;
  39 use Fingerprints::MACCSKeys;
  40 
  my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

  # Turn on autoflush for STDOUT so progress messages show up as they are printed.
  $| = 1;

  # Announce startup and take the starting benchmark time stamp.
  $ScriptName = basename($0);
  print "\n$ScriptName: Starting...\n\n";
  $StartTime = Benchmark->new;

  # Parse and validate the command line; print usage and stop when help is
  # requested or no input file is named.
  SetupScriptUsage();
  die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV < 1);

  my(@SDFilesList) = ExpandFileNames(\@ARGV, "sdf sd");

  # Convert validated raw options into the %OptionsInfo lookup used by the subs.
  print "Processing options...\n";
  my(%OptionsInfo);
  ProcessOptions();

  # Check the input files and work out the output file names for each of them.
  print "Checking input SD file(s)...\n";
  my(%SDFilesInfo);
  RetrieveSDFilesInfo();

  # Generate fingerprints for every input file which passed the checks.
  my($FileIndex);
  print "\nProcessing SD files...\n" if (@SDFilesList > 1);

  for $FileIndex (0 .. $#SDFilesList) {
    next unless $SDFilesInfo{FileOkay}[$FileIndex];

    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    GenerateMACCSKeysFingerprints($FileIndex);
  }
  print "\n$ScriptName:Done...\n\n";

  # Report the elapsed time for the whole run.
  $EndTime = Benchmark->new;
  $TotalTime = timediff($EndTime, $StartTime);
  print "Total time: ", timestr($TotalTime), "\n";
  86 
  87 ###############################################################################
  88 
  # Generate MACCS keys fingerprints for all compounds in one input SD file.
  #
  # $FileIndex is the position of the SD file in @SDFilesList. Each compound is
  # read in turn, optionally filtered, fingerprinted and written to whichever
  # output files were opened; a summary of processed and ignored compounds is
  # printed at the end.
  #
  sub GenerateMACCSKeysFingerprints {
    my($FileIndex) = @_;

    my $SDFile = $SDFilesList[$FileIndex];

    # Output file IO objects; an entry stays undef when that output type is off.
    my($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = SetupAndOpenOutputFiles($FileIndex);

    my $MoleculeFileIO = MoleculeFileIO->new('Name' => $SDFile);
    $MoleculeFileIO->Open();

    my $CmpdCount = 0;
    my $IgnoredCmpdCount = 0;

    COMPOUND: while (my $Molecule = $MoleculeFileIO->ReadMolecule()) {
      $CmpdCount++;

      # Drop compounds rejected by the filter before any fingerprints work...
      if ($OptionsInfo{Filter} && CheckAndFilterCompound($CmpdCount, $Molecule)) {
        $IgnoredCmpdCount++;
        next COMPOUND;
      }

      my $MACCSKeysFingerprints = GenerateMoleculeFingerprints($Molecule);
      unless ($MACCSKeysFingerprints) {
        $IgnoredCmpdCount++;
        ProcessIgnoredCompound('FingerprintsGenerationFailed', $CmpdCount, $Molecule);
        next COMPOUND;
      }

      WriteDataToOutputFiles($FileIndex, $CmpdCount, $Molecule, $MACCSKeysFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
    }
    $MoleculeFileIO->Close();

    # Close the output files which were actually opened, in SD, text, FP order...
    for my $OutputFileIO ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) {
      $OutputFileIO->Close() if $OutputFileIO;
    }

    WriteFingerprintsGenerationSummaryStatistics($CmpdCount, $IgnoredCmpdCount);
  }
 141 
 # Warn about a compound being ignored due to problems in fingerprints generation.
 #
 # $Mode names the reason: ContainsNonElementalData, ContainsNoElementalData or
 # FingerprintsGenerationFailed (matched case-insensitively). Any other value
 # is reported with the same message as FingerprintsGenerationFailed.
 #
 sub ProcessIgnoredCompound {
   my($Mode, $CmpdCount, $Molecule) = @_;

   my $FieldValuesRef = $Molecule->GetDataFieldLabelAndValues();
   my $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $FieldValuesRef);

   # Reason text for each known mode, tried in order...
   my @ReasonsByMode = (
     [qr/^ContainsNonElementalData$/i, "Compound contains atom data corresponding to non-elemental atom symbol(s)..."],
     [qr/^ContainsNoElementalData$/i, "Compound contains no atom data..."],
     [qr/^FingerprintsGenerationFailed$/i, "Fingerprints generation didn't succeed..."],
   );

   # Fallback text for an unrecognized mode...
   my $Reason = "Fingerprints generation didn't succeed...";
   REASON: for my $Entry (@ReasonsByMode) {
     my($ModePattern, $ModeReason) = @{$Entry};
     if ($Mode =~ $ModePattern) {
       $Reason = $ModeReason;
       last REASON;
     }
   }

   warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: $Reason\n\n";
 }
 169 
 # Decide whether a compound must be skipped before fingerprints generation.
 #
 # Returns 1, after issuing a warning through ProcessIgnoredCompound, when the
 # molecule reports any non-element atoms or no element atoms at all;
 # otherwise returns 0.
 #
 sub CheckAndFilterCompound {
   my($CmpdCount, $Molecule) = @_;

   my($NumOfElements, $NumOfNonElements) = $Molecule->GetNumOfElementsAndNonElements();

   # Non-elemental data is checked first and wins when both problems are present.
   my $IgnoreReason = $NumOfNonElements ? 'ContainsNonElementalData'
                    : !$NumOfElements   ? 'ContainsNoElementalData'
                    :                     '';

   return 0 unless $IgnoreReason;

   ProcessIgnoredCompound($IgnoreReason, $CmpdCount, $Molecule);
   return 1;
 }
 190 
 # Print summary statistics for fingerprints generation on one input file.
 #
 # $CmpdCount is the total number of compounds read and $IgnoredCmpdCount the
 # number skipped; the number processed successfully is their difference.
 #
 sub WriteFingerprintsGenerationSummaryStatistics {
   my($CmpdCount, $IgnoredCmpdCount) = @_;

   my $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;

   my @SummaryLines = (
     "\nNumber of compounds: $CmpdCount\n",
     "Number of compounds processed successfully during fingerprints generation: $ProcessedCmpdCount\n",
     "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n",
   );
   for my $SummaryLine (@SummaryLines) {
     print $SummaryLine;
   }
 }
 203 
 # Create and open the fingerprints output files requested for one input file.
 #
 # $FileIndex selects the output file names stored in %SDFilesInfo. Returns a
 # list of three file IO objects in SD, text, FP order; an entry is undef when
 # that output type was not requested.
 #
 sub SetupAndOpenOutputFiles {
   my($FileIndex) = @_;

   # Fingerprints string parameters depend on the fingerprints mode...
   my @ModeSpecificParams = ();
   if ($OptionsInfo{Mode} =~ /^MACCSKeyBits$/i) {
     @ModeSpecificParams = ('FingerprintsStringMode' => 'FingerprintsBitVectorString', 'BitStringFormat' => $OptionsInfo{BitStringFormat}, 'BitsOrder' => $OptionsInfo{BitsOrder});
   }
   elsif ($OptionsInfo{Mode} =~ /^MACCSKeyCount$/i) {
     @ModeSpecificParams = ('FingerprintsStringMode' => 'FingerprintsVectorString', 'VectorStringFormat' => $OptionsInfo{VectorStringFormat});
   }

   # Parameters shared by all fingerprints file IO objects; left empty for an
   # unrecognized mode.
   my %FingerprintsFileIOParams = ();
   if (@ModeSpecificParams) {
     %FingerprintsFileIOParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, @ModeSpecificParams);
   }

   my($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);

   if ($OptionsInfo{SDOutput}) {
     my $SDOutFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
     print "Generating SD file $SDOutFile...\n";
     $NewFPSDFileIO = FileIO::FingerprintsSDFileIO->new('Name' => $SDOutFile, %FingerprintsFileIOParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
     $NewFPSDFileIO->Open();
   }

   if ($OptionsInfo{FPOutput}) {
     my $FPOutFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
     print "Generating FP file $FPOutFile...\n";
     $NewFPFileIO = FileIO::FingerprintsFPFileIO->new('Name' => $FPOutFile, %FingerprintsFileIOParams);
     $NewFPFileIO->Open();
   }

   if ($OptionsInfo{TextOutput}) {
     my $TextOutFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
     my $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);

     print "Generating text file $TextOutFile...\n";
     $NewFPTextFileIO = FileIO::FingerprintsTextFileIO->new('Name' => $TextOutFile, %FingerprintsFileIOParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
     $NewFPTextFileIO->Open();
   }

   return ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
 }
 249 
 # Write fingerprints and other compound data to the open output files.
 #
 # Any of $NewFPSDFileIO, $NewFPTextFileIO and $NewFPFileIO may be undef; only
 # the defined ones are written to. SD output gets the input molecule string,
 # text output gets the data column values and FP output gets the compound ID.
 #
 sub WriteDataToOutputFiles {
   my($FileIndex, $CmpdCount, $Molecule, $MACCSKeysFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;

   # Data field values are only needed by the text and FP writers...
   my $FieldValuesRef = ($NewFPTextFileIO || $NewFPFileIO) ? $Molecule->GetDataFieldLabelAndValues() : undef;

   if ($NewFPSDFileIO) {
     my $InputCmpdString = $Molecule->GetInputMoleculeString();
     $NewFPSDFileIO->WriteFingerprints($MACCSKeysFingerprints, $InputCmpdString);
   }

   if ($NewFPTextFileIO) {
     my $DataColValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $FieldValuesRef);
     $NewFPTextFileIO->WriteFingerprints($MACCSKeysFingerprints, $DataColValuesRef);
   }

   if ($NewFPFileIO) {
     my $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $FieldValuesRef);
     $NewFPFileIO->WriteFingerprints($MACCSKeysFingerprints, $CmpdID);
   }
 }
 282 
 # Generate appropriate column labels for the fingerprints text output file.
 #
 # The data column labels are picked according to the data fields mode and are
 # followed by the fingerprints label. Returns a reference to the label list.
 #
 sub SetupFPTextFileCoulmnLabels {
   my($FileIndex) = @_;

   my $DataFieldsMode = $OptionsInfo{DataFieldsMode};
   my @DataLabels = ();

   if ($DataFieldsMode =~ /^All$/i) {
     @DataLabels = @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
   }
   elsif ($DataFieldsMode =~ /^Common$/i) {
     @DataLabels = @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
   }
   elsif ($DataFieldsMode =~ /^Specify$/i) {
     @DataLabels = @{$OptionsInfo{SpecifiedDataFields}};
   }
   elsif ($DataFieldsMode =~ /^CompoundID$/i) {
     @DataLabels = ($OptionsInfo{CompoundIDLabel});
   }

   # Fingerprints label always goes last...
   return [@DataLabels, $OptionsInfo{FingerprintsLabel}];
 }
 307 
 # Generate data column values for one compound in the fingerprints text file.
 #
 # In CompoundID mode the only value is the compound ID. In All, Common and
 # Specify modes there is one value per data field label, with an empty string
 # for a label the compound doesn't have. Returns a reference to the value list.
 #
 sub SetupFPTextFileCoulmnValues {
   my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;

   my $DataFieldsMode = $OptionsInfo{DataFieldsMode};

   if ($DataFieldsMode =~ /^CompoundID$/i) {
     return [SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef)];
   }

   # Pick the list of data field labels to look up...
   my $DataLabelsRef;
   if ($DataFieldsMode =~ /^All$/i) {
     $DataLabelsRef = $SDFilesInfo{AllDataFieldsRef}[$FileIndex];
   }
   elsif ($DataFieldsMode =~ /^Common$/i) {
     $DataLabelsRef = $SDFilesInfo{CommonDataFieldsRef}[$FileIndex];
   }
   elsif ($DataFieldsMode =~ /^Specify$/i) {
     $DataLabelsRef = $OptionsInfo{SpecifiedDataFields};
   }
   else {
     return [];
   }

   my @DataValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : '' } @{$DataLabelsRef};

   return \@DataValues;
 }
 330 
 # Generate the compound ID used in FP and text output files and in warnings.
 #
 # The ID depends on the compound ID mode:
 #   MolNameOrLabelPrefix - molecule name, or prefix plus compound number when
 #                          the name is empty
 #   LabelPrefix          - prefix plus compound number
 #   DataField            - value of the specified data field, or an empty
 #                          string when the compound doesn't have that field
 #   MolName              - molecule name
 # Any other mode gives an empty string.
 #
 sub SetupCmpdIDForOutputFiles {
   my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;

   my $CompoundIDMode = $OptionsInfo{CompoundIDMode};

   if ($CompoundIDMode =~ /^MolNameOrLabelPrefix$/i) {
     my $MolName = $Molecule->GetName();
     return $MolName if $MolName;
     return "$OptionsInfo{CompoundID}${CmpdCount}";
   }

   if ($CompoundIDMode =~ /^LabelPrefix$/i) {
     return "$OptionsInfo{CompoundID}${CmpdCount}";
   }

   if ($CompoundIDMode =~ /^DataField$/i) {
     # In this mode the CompoundID option holds a data field label...
     my $DataFieldLabel = $OptionsInfo{CompoundID};
     return exists $DataFieldLabelAndValuesRef->{$DataFieldLabel} ? $DataFieldLabelAndValuesRef->{$DataFieldLabel} : '';
   }

   if ($CompoundIDMode =~ /^MolName$/i) {
     return scalar($Molecule->GetName());
   }

   return '';
 }
 356 
 # Generate MACCS keys fingerprints for a molecule.
 #
 # The molecule is first prepared: optionally reduced to its largest component,
 # then ring and aromaticity detection are run using the configured aromaticity
 # model. Returns the fingerprints object, or undef when ring detection fails.
 # Dies for a fingerprints mode other than MACCSKeyBits or MACCSKeyCount.
 #
 sub GenerateMoleculeFingerprints {
   my($Molecule) = @_;

   $Molecule->KeepLargestComponent() if $OptionsInfo{KeepLargestComponent};

   return undef unless $Molecule->DetectRings();

   $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
   $Molecule->DetectAromaticity();

   # Map the mode option to the fingerprints type name...
   my $FingerprintsType;
   if ($OptionsInfo{Mode} =~ /^MACCSKeyBits$/i) {
     $FingerprintsType = 'MACCSKeyBits';
   }
   elsif ($OptionsInfo{Mode} =~ /^MACCSKeyCount$/i) {
     $FingerprintsType = 'MACCSKeyCount';
   }
   else {
     die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: MACCSKeyBits or MACCSKeyCount\n";
   }

   my $Fingerprints = Fingerprints::MACCSKeys->new('Molecule' => $Molecule, 'Type' => $FingerprintsType, 'Size' => $OptionsInfo{Size});
   $Fingerprints->GenerateMACCSKeys();

   return $Fingerprints;
 }
 386 
 # Retrieve information about input SD files and set up output file names.
 #
 # Walks @SDFilesList and fills the file-level %SDFilesInfo hash with arrays
 # indexed like @SDFilesList: FileOkay, OutFileRoot, SDOutFileNames,
 # FPOutFileNames, TextOutFileNames, AllDataFieldsRef and CommonDataFieldsRef.
 # A file is marked okay only after it passes every check below; otherwise a
 # warning is issued and its FileOkay entry stays 0. Dies when an existing SD
 # file can't be opened for reading.
 #
 sub RetrieveSDFilesInfo {
   my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);

   %SDFilesInfo = ();
   @{$SDFilesInfo{FileOkay}} = ();
   @{$SDFilesInfo{OutFileRoot}} = ();
   @{$SDFilesInfo{SDOutFileNames}} = ();
   @{$SDFilesInfo{FPOutFileNames}} = ();
   @{$SDFilesInfo{TextOutFileNames}} = ();
   @{$SDFilesInfo{AllDataFieldsRef}} = ();
   @{$SDFilesInfo{CommonDataFieldsRef}} = ();

   # The compound ID data field is verified, and data field labels collected,
   # only when the text output actually needs them...
   $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
   $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

   FILELIST: for $Index (0 .. $#SDFilesList) {
     $SDFile = $SDFilesList[$Index];

     $SDFilesInfo{FileOkay}[$Index] = 0;
     $SDFilesInfo{OutFileRoot}[$Index] = '';
     $SDFilesInfo{SDOutFileNames}[$Index] = '';
     $SDFilesInfo{FPOutFileNames}[$Index] = '';
     $SDFilesInfo{TextOutFileNames}[$Index] = '';

     if (!(-e $SDFile)) {
       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
       next FILELIST;
     }
     if (!CheckFileType($SDFile, "sd sdf")) {
       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
       next FILELIST;
     }

     if ($CheckDataField) {
       # Make sure data field exists in the first compound of SD file..
       my($CmpdString, $SpecifiedDataField, $SDFileHandle, @CmpdLines, %DataFieldValues);

       # Three-argument open takes the file name literally, matching the -e
       # test above even for names with special leading or trailing characters.
       open $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
       $CmpdString = ReadCmpdString($SDFileHandle);
       close $SDFileHandle;

       # Guard against a missing first record, e.g. for an empty file...
       @CmpdLines = defined($CmpdString) ? split("\n", $CmpdString) : ();
       %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
       $SpecifiedDataField = $OptionsInfo{CompoundID};
       if (!exists $DataFieldValues{$SpecifiedDataField}) {
         warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using  \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
         next FILELIST;
       }
     }

     $AllDataFieldsRef = '';
     $CommonDataFieldsRef = '';
     if ($CollectDataFields) {
       my($SDFileHandle);

       open $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
       # First returned value, the compound count, is not needed here...
       (undef, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileHandle);
       close $SDFileHandle;
     }

     # Setup output file names...
     $FileDir = ""; $FileName = ""; $FileExt = "";
     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

     $TextOutFileExt = ($OptionsInfo{OutDelim} =~ /^tab$/i) ? "tsv" : "csv";
     $SDOutFileExt = $FileExt;
     $FPOutFileExt = "fpf";

     # A user supplied root is honored only for a single input file...
     if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
       if ($RootFileName && $RootFileExt) {
         $FileName = $RootFileName;
       }
       else {
         $FileName = $OptionsInfo{OutFileRoot};
       }
       $OutFileRoot = $FileName;
     }
     else {
       $OutFileRoot = "${FileName}MACCSKeysFP";
     }

     $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
     $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
     $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

     if ($OptionsInfo{SDOutput}) {
       # Quote the file name so that characters such as '.', '+' or '(' in it
       # are matched literally instead of being treated as regex syntax.
       if ($SDFile =~ /\Q$NewSDFileName\E/i) {
         warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
         print "Specify a different name using \"-r --root\" option or use default name.\n";
         next FILELIST;
       }
     }

     if (!$OptionsInfo{OverwriteFiles}) {
       # Check SD, FP and text output files...
       if ($OptionsInfo{SDOutput}) {
         if (-e $NewSDFileName) {
           warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
           next FILELIST;
         }
       }
       if ($OptionsInfo{FPOutput}) {
         if (-e $NewFPFileName) {
           warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
           next FILELIST;
         }
       }
       if ($OptionsInfo{TextOutput}) {
         if (-e $NewTextFileName) {
           warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
           next FILELIST;
         }
       }
     }

     $SDFilesInfo{FileOkay}[$Index] = 1;

     $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
     $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
     $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
     $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

     $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
     $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
   }
 }
 519 
 # Process option values...
 #
 # Translates the raw, already validated command line values in %Options into
 # the file-level %OptionsInfo hash: plain values are copied, Yes/No values
 # become 1/0 flags, and the output type is expanded into SDOutput, FPOutput
 # and TextOutput flags. Dies when a required companion option is missing.
 #
 sub ProcessOptions {
   %OptionsInfo = ();

   # Values copied over unchanged: %OptionsInfo key => %Options key...
   my %CopiedOptionsMap = (
     Mode => 'mode',
     AromaticityModel => 'aromaticitymodel',
     BitsOrder => 'bitsorder',
     BitStringFormat => 'bitstringformat',
     CompoundIDMode => 'compoundidmode',
     CompoundIDLabel => 'compoundidlabel',
     DataFieldsMode => 'datafieldsmode',
     Output => 'output',
     OutDelim => 'outdelim',
     Size => 'size',
     VectorStringFormat => 'vectorstringformat',
   );
   @OptionsInfo{keys %CopiedOptionsMap} = @Options{values %CopiedOptionsMap};

   # Yes/No values turned into 1/0 flags...
   my $YesNoToFlag = sub { my($Value) = @_; return ($Value =~ /^Yes$/i) ? 1 : 0; };
   $OptionsInfo{Filter} = $YesNoToFlag->($Options{filter});
   $OptionsInfo{KeepLargestComponent} = $YesNoToFlag->($Options{keeplargestcomponent});
   $OptionsInfo{OutQuote} = $YesNoToFlag->($Options{quote});

   # Compound ID and data fields settings...
   #
   # NOTE(review): CompoundID is filled in only for CompoundID data fields mode;
   # in the other modes SetupCmpdIDForOutputFiles sees an empty string here -
   # confirm that is intended for FP output and warning messages.
   $OptionsInfo{SpecifiedDataFields} = [];
   $OptionsInfo{CompoundID} = '';

   if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
     if ($Options{compoundidmode} =~ /^DataField$/i) {
       die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n" unless $Options{compoundid};
       $OptionsInfo{CompoundID} = $Options{compoundid};
     }
     elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
       $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
     }
   }
   elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
     die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n" unless $Options{datafields};
     $OptionsInfo{SpecifiedDataFields} = [split /\,/, $Options{datafields}];
   }

   $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'MACCSKeysFingerprints';

   # Output types; "All" turns on every one of them...
   $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
   $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
   $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;

   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
 }
 580 
 # Setup script usage and retrieve command line arguments specified using various options...
 #
 # Fills the file-level %Options hash with default values, overlays the values
 # given on the command line, changes to the working directory when one is
 # specified, and dies with an explanatory message on the first invalid value.
 #
 sub SetupScriptUsage {

   # Default values for options...
   %Options = (
     aromaticitymodel => 'MayaChemToolsAromaticityModel',
     bitsorder => 'Ascending',
     bitstringformat => 'BinaryString',
     compoundidmode => 'LabelPrefix',
     compoundidlabel => 'CompoundID',
     datafieldsmode => 'CompoundID',
     filter => 'Yes',
     keeplargestcomponent => 'Yes',
     mode => 'MACCSKeyBits',
     output => 'text',
     outdelim => 'comma',
     quote => 'yes',
     size => 166,
     vectorstringformat => 'ValuesString',
   );

   # Retrieve all the options...
   my @OptionSpecs = ("aromaticitymodel=s", "bitsorder=s", "bitstringformat|b=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "filter|f=s", "fingerprintslabel=s",  "help|h", "keeplargestcomponent|k=s", "mode|m=s", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s", "size|s=i", "vectorstringformat|v=s", "workingdir|w=s");
   unless (GetOptions(\%Options, @OptionSpecs)) {
     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
   }

   if ($Options{workingdir}) {
     unless (-d $Options{workingdir}) {
       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
     }
     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
   }

   unless (Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
     my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
     die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
   }

   # Shared check for options restricted to a fixed set of values: dies unless
   # the value of option $Key matches $AllowedPattern...
   my $CheckAllowedValue = sub {
     my($Key, $AllowedPattern, $OptionName, $AllowedValues) = @_;

     return if $Options{$Key} =~ $AllowedPattern;
     die "Error: The value specified, $Options{$Key}, for option \"$OptionName\" is not valid. Allowed values: $AllowedValues\n";
   };

   $CheckAllowedValue->('bitsorder', qr/^(Ascending|Descending)$/i, '--BitsOrder', 'Ascending or Descending');
   $CheckAllowedValue->('bitstringformat', qr/^(BinaryString|HexadecimalString)$/i, '-b, --bitstringformat', 'BinaryString or HexadecimalString');
   $CheckAllowedValue->('compoundidmode', qr/^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i, '--CompoundIDMode', 'DataField, MolName, LabelPrefix or MolNameOrLabelPrefix');
   $CheckAllowedValue->('datafieldsmode', qr/^(All|Common|Specify|CompoundID)$/i, '-d, --DataFieldsMode', 'All, Common, Specify or CompoundID');
   $CheckAllowedValue->('filter', qr/^(Yes|No)$/i, '-f, --Filter', 'Yes or No');
   $CheckAllowedValue->('keeplargestcomponent', qr/^(Yes|No)$/i, '-k, --KeepLargestComponent', 'Yes or No');
   $CheckAllowedValue->('mode', qr/^(MACCSKeyBits|MACCSKeyCount)$/i, '-m, --mode', 'MACCSKeyBits or MACCSKeyCount');
   $CheckAllowedValue->('output', qr/^(SD|FP|text|all)$/i, '--output', 'SD, FP, text, or all');
   $CheckAllowedValue->('outdelim', qr/^(comma|semicolon|tab)$/i, '--outdelim', 'comma, tab, or semicolon');
   $CheckAllowedValue->('quote', qr/^(Yes|No)$/i, '-q --quote', 'Yes or No');

   # Unquoted values can't be combined with semicolon as the delimiter...
   if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
   }

   my $SizeIsValid = IsPositiveInteger($Options{size}) && ($Options{size} == 166 || $Options{size} == 322);
   unless ($SizeIsValid) {
     die "Error: The value specified, $Options{size}, for option \"-s, --size\" is not valid. Allowed values: 166 or 322 \n";
   }

   $CheckAllowedValue->('vectorstringformat', qr/^(ValuesString|IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i, '-v, --VectorStringFormat', 'ValuesString, IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString');
 }
 663