MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # File: EStateIndiciesFingerprints.pl
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
  16 # any warranty; without even the implied warranty of merchantability or fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use FindBin; use lib "$FindBin::Bin/../lib";
  28 use Getopt::Long;
  29 use File::Basename;
  30 use Text::ParseWords;
  31 use Benchmark;
  32 use FileUtil;
  33 use TextUtil;
  34 use SDFileUtil;
  35 use MoleculeFileIO;
  36 use FileIO::FingerprintsSDFileIO;
  37 use FileIO::FingerprintsTextFileIO;
  38 use FileIO::FingerprintsFPFileIO;
  39 use AtomTypes::EStateAtomTypes;
  40 use Fingerprints::EStateIndiciesFingerprints;
  41 
  42 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  43 
  44 # Autoflush STDOUT
  45 $| = 1;
  46 
  47 # Starting message...
  48 $ScriptName = basename($0);
  49 print "\n$ScriptName: Starting...\n\n";
  50 $StartTime = new Benchmark;
  51 
  52 # Get the options and setup script...
  53 SetupScriptUsage();
  54 if ($Options{help} || @ARGV < 1) {
  55   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  56 }
  57 
  58 my(@SDFilesList);
  59 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  60 
  61 # Process options...
  62 print "Processing options...\n";
  63 my(%OptionsInfo);
  64 ProcessOptions();
  65 
  66 # Setup information about input files...
  67 print "Checking input SD file(s)...\n";
  68 my(%SDFilesInfo);
  69 RetrieveSDFilesInfo();
  70 
  71 # Process input files..
  72 my($FileIndex);
  73 if (@SDFilesList > 1) {
  74   print "\nProcessing SD files...\n";
  75 }
  76 for $FileIndex (0 .. $#SDFilesList) {
  77   if ($SDFilesInfo{FileOkay}[$FileIndex]) {
  78     print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  79     GenerateEStateIndiciesFingerprints($FileIndex);
  80   }
  81 }
  82 print "\n$ScriptName:Done...\n\n";
  83 
  84 $EndTime = new Benchmark;
  85 $TotalTime = timediff ($EndTime, $StartTime);
  86 print "Total time: ", timestr($TotalTime), "\n";
  87 
  88 ###############################################################################
  89 
  90 # Generate fingerprints for a SD file...
  91 #
  92 sub GenerateEStateIndiciesFingerprints {
  93   my($FileIndex) = @_;
  94   my($CmpdCount, $IgnoredCmpdCount, $SDFile, $MoleculeFileIO, $Molecule, $EStateIndiciesFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
  95 
  96   $SDFile = $SDFilesList[$FileIndex];
  97 
  98   # Setup output files...
  99   #
 100   ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = SetupAndOpenOutputFiles($FileIndex);
 101 
 102   $MoleculeFileIO = new MoleculeFileIO('Name' => $SDFile);
 103   $MoleculeFileIO->Open();
 104 
 105   $CmpdCount = 0;
 106   $IgnoredCmpdCount = 0;
 107 
 108   COMPOUND: while ($Molecule = $MoleculeFileIO->ReadMolecule()) {
 109     $CmpdCount++;
 110 
 111     # Filter compound data before calculating fingerprints...
 112     if ($OptionsInfo{Filter}) {
 113       if (CheckAndFilterCompound($CmpdCount, $Molecule)) {
 114         $IgnoredCmpdCount++;
 115         next COMPOUND;
 116       }
 117     }
 118 
 119     $EStateIndiciesFingerprints = GenerateMoleculeFingerprints($Molecule);
 120     if (!$EStateIndiciesFingerprints) {
 121       $IgnoredCmpdCount++;
 122       ProcessIgnoredCompound('FingerprintsGenerationFailed', $CmpdCount, $Molecule);
 123       next COMPOUND;
 124     }
 125 
 126     WriteDataToOutputFiles($FileIndex, $CmpdCount, $Molecule, $EStateIndiciesFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
 127   }
 128   $MoleculeFileIO->Close();
 129 
 130   if ($NewFPSDFileIO) {
 131     $NewFPSDFileIO->Close();
 132   }
 133   if ($NewFPTextFileIO) {
 134     $NewFPTextFileIO->Close();
 135   }
 136   if ($NewFPFileIO) {
 137     $NewFPFileIO->Close();
 138   }
 139 
 140   WriteFingerprintsGenerationSummaryStatistics($CmpdCount, $IgnoredCmpdCount);
 141 }
 142 
 143 # Process compound being ignored due to problems in fingerprints geneation...
 144 #
 145 sub ProcessIgnoredCompound {
 146   my($Mode, $CmpdCount, $Molecule) = @_;
 147   my($CmpdID, $DataFieldLabelAndValuesRef);
 148 
 149   $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
 150   $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 151 
 152   MODE: {
 153     if ($Mode =~ /^ContainsNonElementalData$/i) {
 154       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
 155       next MODE;
 156     }
 157 
 158     if ($Mode =~ /^ContainsNoElementalData$/i) {
 159       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n";
 160       next MODE;
 161     }
 162 
 163     if ($Mode =~ /^FingerprintsGenerationFailed$/i) {
 164       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
 165       next MODE;
 166     }
 167     warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
 168   }
 169 }
 170 
 171 # Check and filter compounds....
 172 #
 173 sub CheckAndFilterCompound {
 174   my($CmpdCount, $Molecule) = @_;
 175   my($ElementCount, $NonElementCount);
 176 
 177   ($ElementCount, $NonElementCount) = $Molecule->GetNumOfElementsAndNonElements();
 178 
 179   if ($NonElementCount) {
 180     ProcessIgnoredCompound('ContainsNonElementalData', $CmpdCount, $Molecule);
 181     return 1;
 182   }
 183 
 184   if (!$ElementCount) {
 185     ProcessIgnoredCompound('ContainsNoElementalData', $CmpdCount, $Molecule);
 186     return 1;
 187   }
 188 
 189   return 0;
 190 }
 191 
 192 # Write out compounds fingerprints generation summary statistics...
 193 #
 194 sub WriteFingerprintsGenerationSummaryStatistics {
 195   my($CmpdCount, $IgnoredCmpdCount) = @_;
 196   my($ProcessedCmpdCount);
 197 
 198   $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;
 199 
 200   print "\nNumber of compounds: $CmpdCount\n";
 201   print "Number of compounds processed successfully during fingerprints generation: $ProcessedCmpdCount\n";
 202   print "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
 203 }
 204 
 205 # Open output files...
 206 #
 207 sub SetupAndOpenOutputFiles {
 208   my($FileIndex) = @_;
 209   my($NewFPSDFile, $NewFPFile, $NewFPTextFile, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO, %FingerprintsFileIOParams);
 210 
 211   ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = (undef) x 3;
 212 
 213   # Setup common parameters for fingerprints file IO objects...
 214   #
 215   %FingerprintsFileIOParams = ();
 216   %FingerprintsFileIOParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsVectorString', 'VectorStringFormat' => $OptionsInfo{VectorStringFormat});
 217 
 218   if ($OptionsInfo{SDOutput}) {
 219     $NewFPSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
 220     print "Generating SD file $NewFPSDFile...\n";
 221     $NewFPSDFileIO = new FileIO::FingerprintsSDFileIO('Name' => $NewFPSDFile, %FingerprintsFileIOParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
 222     $NewFPSDFileIO->Open();
 223   }
 224 
 225   if ($OptionsInfo{FPOutput}) {
 226     $NewFPFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
 227     print "Generating FP file $NewFPFile...\n";
 228     $NewFPFileIO = new FileIO::FingerprintsFPFileIO('Name' => $NewFPFile, %FingerprintsFileIOParams);
 229     $NewFPFileIO->Open();
 230   }
 231 
 232   if ($OptionsInfo{TextOutput}) {
 233     my($ColLabelsRef);
 234 
 235     $NewFPTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
 236     $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);
 237 
 238     print "Generating text file $NewFPTextFile...\n";
 239     $NewFPTextFileIO = new FileIO::FingerprintsTextFileIO('Name' => $NewFPTextFile, %FingerprintsFileIOParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
 240     $NewFPTextFileIO->Open();
 241   }
 242 
 243   return ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
 244 }
 245 
 246 # Write fingerpritns and other data to appropriate output files...
 247 #
 248 sub WriteDataToOutputFiles {
 249   my($FileIndex, $CmpdCount, $Molecule, $EStateIndiciesFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;
 250   my($DataFieldLabelAndValuesRef);
 251 
 252   $DataFieldLabelAndValuesRef = undef;
 253   if ($NewFPTextFileIO || $NewFPFileIO) {
 254     $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
 255   }
 256 
 257   if ($NewFPSDFileIO) {
 258     my($CmpdString);
 259 
 260     $CmpdString = $Molecule->GetInputMoleculeString();
 261     $NewFPSDFileIO->WriteFingerprints($EStateIndiciesFingerprints, $CmpdString);
 262   }
 263 
 264   if ($NewFPTextFileIO) {
 265     my($ColValuesRef);
 266 
 267     $ColValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 268     $NewFPTextFileIO->WriteFingerprints($EStateIndiciesFingerprints, $ColValuesRef);
 269   }
 270 
 271   if ($NewFPFileIO) {
 272     my($CompoundID);
 273 
 274     $CompoundID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 275     $NewFPFileIO->WriteFingerprints($EStateIndiciesFingerprints, $CompoundID);
 276   }
 277 
 278 }
 279 
 280 # Generate approriate column labels for FPText output file...
 281 #
 282 sub SetupFPTextFileCoulmnLabels {
 283   my($FileIndex) = @_;
 284   my($Line, @ColLabels);
 285 
 286   @ColLabels = ();
 287   if ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
 288     push @ColLabels, @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
 289   }
 290   elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
 291     push @ColLabels, @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
 292   }
 293   elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
 294     push @ColLabels, @{$OptionsInfo{SpecifiedDataFields}};
 295   }
 296   elsif ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
 297     push @ColLabels, $OptionsInfo{CompoundIDLabel};
 298   }
 299   # Add fingerprints label...
 300   push @ColLabels, $OptionsInfo{FingerprintsLabel};
 301 
 302   return \@ColLabels;
 303 }
 304 
 305 # Generate column values FPText output file..
 306 #
 307 sub SetupFPTextFileCoulmnValues {
 308   my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
 309   my(@ColValues);
 310 
 311   @ColValues = ();
 312   if ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
 313     push @ColValues, SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 314   }
 315   elsif ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
 316     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
 317   }
 318   elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
 319     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
 320   }
 321   elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
 322     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$OptionsInfo{SpecifiedDataFields}};
 323   }
 324 
 325   return \@ColValues;
 326 }
 327 
 328 # Generate compound ID for FP and FPText output files..
 329 #
 330 sub SetupCmpdIDForOutputFiles {
 331   my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
 332   my($CmpdID);
 333 
 334   $CmpdID = '';
 335   if ($OptionsInfo{CompoundIDMode} =~ /^MolNameOrLabelPrefix$/i) {
 336     my($MolName);
 337     $MolName = $Molecule->GetName();
 338     $CmpdID = $MolName ? $MolName : "$OptionsInfo{CompoundID}${CmpdCount}";
 339   }
 340   elsif ($OptionsInfo{CompoundIDMode} =~ /^LabelPrefix$/i) {
 341     $CmpdID = "$OptionsInfo{CompoundID}${CmpdCount}";
 342   }
 343   elsif ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i) {
 344     my($SpecifiedDataField);
 345     $SpecifiedDataField = $OptionsInfo{CompoundID};
 346     $CmpdID = exists $DataFieldLabelAndValuesRef->{$SpecifiedDataField} ? $DataFieldLabelAndValuesRef->{$SpecifiedDataField} : '';
 347   }
 348   elsif ($OptionsInfo{CompoundIDMode} =~ /^MolName$/i) {
 349     $CmpdID = $Molecule->GetName();
 350   }
 351   return $CmpdID;
 352 }
 353 
 354 # Generate fingerprints for molecule...
 355 #
 356 sub GenerateMoleculeFingerprints {
 357   my($Molecule) = @_;
 358   my($EStateIndiciesFingerprints);
 359 
 360   if ($OptionsInfo{KeepLargestComponent}) {
 361     $Molecule->KeepLargestComponent();
 362   }
 363   if (!$Molecule->DetectRings()) {
 364     return undef;
 365   }
 366   $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
 367   $Molecule->DetectAromaticity();
 368 
 369   $EStateIndiciesFingerprints = new Fingerprints::EStateIndiciesFingerprints('Molecule' => $Molecule, 'EStateAtomTypesSetToUse' => $OptionsInfo{EStateAtomTypesSetToUse}, 'ValuesPrecision' => $OptionsInfo{ValuesPrecision});
 370 
 371   # Generate E-state indicies fingerprints...
 372   $EStateIndiciesFingerprints->GenerateFingerprints();
 373 
 374   # Make sure E-state indicies fingerprints generation is successful...
 375   if (!$EStateIndiciesFingerprints->IsFingerprintsGenerationSuccessful()) {
 376     return undef;
 377   }
 378 
 379   return $EStateIndiciesFingerprints;
 380 }
 381 
 382 # Retrieve information about SD files...
 383 #
 384 sub RetrieveSDFilesInfo {
 385   my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);
 386 
 387   %SDFilesInfo = ();
 388   @{$SDFilesInfo{FileOkay}} = ();
 389   @{$SDFilesInfo{OutFileRoot}} = ();
 390   @{$SDFilesInfo{SDOutFileNames}} = ();
 391   @{$SDFilesInfo{FPOutFileNames}} = ();
 392   @{$SDFilesInfo{TextOutFileNames}} = ();
 393   @{$SDFilesInfo{AllDataFieldsRef}} = ();
 394   @{$SDFilesInfo{CommonDataFieldsRef}} = ();
 395 
 396   $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
 397   $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;
 398 
 399   FILELIST: for $Index (0 .. $#SDFilesList) {
 400     $SDFile = $SDFilesList[$Index];
 401 
 402     $SDFilesInfo{FileOkay}[$Index] = 0;
 403     $SDFilesInfo{OutFileRoot}[$Index] = '';
 404     $SDFilesInfo{SDOutFileNames}[$Index] = '';
 405     $SDFilesInfo{FPOutFileNames}[$Index] = '';
 406     $SDFilesInfo{TextOutFileNames}[$Index] = '';
 407 
 408     $SDFile = $SDFilesList[$Index];
 409     if (!(-e $SDFile)) {
 410       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 411       next FILELIST;
 412     }
 413     if (!CheckFileType($SDFile, "sd sdf")) {
 414       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 415       next FILELIST;
 416     }
 417 
 418     if ($CheckDataField) {
 419       # Make sure data field exists in SD file..
 420       my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);
 421 
 422       @CmpdLines = ();
 423       open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 424       $CmpdString = ReadCmpdString(\*SDFILE);
 425       close SDFILE;
 426       @CmpdLines = split "\n", $CmpdString;
 427       %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 428       $SpecifiedDataField = $OptionsInfo{CompoundID};
 429       if (!exists $DataFieldValues{$SpecifiedDataField}) {
 430         warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using  \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
 431         next FILELIST;
 432       }
 433     }
 434 
 435     $AllDataFieldsRef = '';
 436     $CommonDataFieldsRef = '';
 437     if ($CollectDataFields) {
 438       my($CmpdCount);
 439       open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 440       ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE);
 441       close SDFILE;
 442     }
 443 
 444     # Setup output file names...
 445     $FileDir = ""; $FileName = ""; $FileExt = "";
 446     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 447 
 448     $TextOutFileExt = "csv";
 449     if ($Options{outdelim} =~ /^tab$/i) {
 450       $TextOutFileExt = "tsv";
 451     }
 452     $SDOutFileExt = $FileExt;
 453     $FPOutFileExt = "fpf";
 454 
 455     if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
 456       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 457       if ($RootFileName && $RootFileExt) {
 458         $FileName = $RootFileName;
 459       }
 460       else {
 461         $FileName = $OptionsInfo{OutFileRoot};
 462       }
 463       $OutFileRoot = $FileName;
 464     }
 465     else {
 466       $OutFileRoot = "${FileName}EStateIndiciesFP";
 467     }
 468 
 469     $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
 470     $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
 471     $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";
 472 
 473     if ($OptionsInfo{SDOutput}) {
 474       if ($SDFile =~ /$NewSDFileName/i) {
 475         warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
 476         print "Specify a different name using \"-r --root\" option or use default name.\n";
 477         next FILELIST;
 478       }
 479     }
 480 
 481     if (!$OptionsInfo{OverwriteFiles}) {
 482       # Check SD and text outout files...
 483       if ($OptionsInfo{SDOutput}) {
 484         if (-e $NewSDFileName) {
 485           warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
 486           next FILELIST;
 487         }
 488       }
 489       if ($OptionsInfo{FPOutput}) {
 490         if (-e $NewFPFileName) {
 491           warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
 492           next FILELIST;
 493         }
 494       }
 495       if ($OptionsInfo{TextOutput}) {
 496         if (-e $NewTextFileName) {
 497           warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
 498           next FILELIST;
 499         }
 500       }
 501     }
 502 
 503     $SDFilesInfo{FileOkay}[$Index] = 1;
 504 
 505     $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
 506     $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
 507     $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
 508     $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;
 509 
 510     $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
 511     $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
 512   }
 513 }
 514 
 515 # Process option values...
 516 sub ProcessOptions {
 517   %OptionsInfo = ();
 518 
 519   $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel};
 520 
 521   $OptionsInfo{EStateAtomTypesSetToUse} = $Options{estateatomtypessettouse} ? $Options{estateatomtypessettouse} : 'ArbitrarySize';
 522 
 523   $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
 524   $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
 525   $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};
 526 
 527   my(@SpecifiedDataFields);
 528   @SpecifiedDataFields = ();
 529 
 530   @{$OptionsInfo{SpecifiedDataFields}} = ();
 531   $OptionsInfo{CompoundID} = '';
 532 
 533   if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
 534     if ($Options{compoundidmode} =~ /^DataField$/i) {
 535       if (!$Options{compoundid}) {
 536         die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
 537       }
 538       $OptionsInfo{CompoundID} = $Options{compoundid};
 539     }
 540     elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
 541       $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
 542     }
 543   }
 544   elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
 545     if (!$Options{datafields}) {
 546       die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
 547     }
 548     @SpecifiedDataFields = split /\,/, $Options{datafields};
 549     push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields;
 550   }
 551 
 552   $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'EStateIndiciesFingerprints';
 553 
 554   $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;
 555 
 556   if ($Options{fingerprintslabelmode} =~ /^FingerprintsLabelWithIDs$/) {
 557     if ($Options{estateatomtypessettouse} =~ /^FixedSize$/i) {
 558       # Append E-state atom types for non-hydrogen atoms to the fingerprints label...
 559       my($AtomType, @IDs);
 560       @IDs = ();
 561       for $AtomType (@{AtomTypes::EStateAtomTypes::GetAllPossibleEStateNonHydrogenAtomTypes()}) {
 562         push @IDs, "S${AtomType}";
 563       }
 564       $OptionsInfo{FingerprintsLabel} .= "; EStateAtomTypes: " . TextUtil::JoinWords(\@IDs, " ", 0);
 565     }
 566   }
 567   $OptionsInfo{FingerprintsLabelMode} = $Options{fingerprintslabelmode};
 568 
 569   $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;
 570 
 571   $OptionsInfo{Output} = $Options{output};
 572   $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
 573   $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
 574   $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;
 575 
 576   $OptionsInfo{OutDelim} = $Options{outdelim};
 577   $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;
 578 
 579   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
 580   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
 581 
 582   # Precision for E-state indicies...
 583   $OptionsInfo{ValuesPrecision} = $Options{valuesprecision};
 584 
 585   # Setup default vector string format...
 586   my($VectorStringFormat);
 587   $VectorStringFormat = '';
 588   if ($Options{vectorstringformat}) {
 589     $VectorStringFormat = $Options{vectorstringformat};
 590   }
 591   else {
 592     $VectorStringFormat = ($Options{estateatomtypessettouse} =~ /^FixedSize$/) ? "ValuesString" : "IDsAndValuesString";
 593   }
 594   $OptionsInfo{VectorStringFormat} = $VectorStringFormat;
 595 }
 596 
 597 # Setup script usage  and retrieve command line arguments specified using various options...
 598 sub SetupScriptUsage {
 599 
 600   # Retrieve all the options...
 601   %Options = ();
 602 
 603   $Options{aromaticitymodel} = 'MayaChemToolsAromaticityModel';
 604 
 605   $Options{compoundidmode} = 'LabelPrefix';
 606   $Options{compoundidlabel} = 'CompoundID';
 607   $Options{datafieldsmode} = 'CompoundID';
 608 
 609   $Options{filter} = 'Yes';
 610 
 611   $Options{estateatomtypessettouse} = 'ArbitrarySize';
 612 
 613   $Options{fingerprintslabelmode} = 'FingerprintsLabelOnly';
 614   $Options{keeplargestcomponent} = 'Yes';
 615 
 616   $Options{output} = 'text';
 617   $Options{outdelim} = 'comma';
 618   $Options{quote} = 'yes';
 619 
 620   $Options{valuesprecision} = 3;
 621 
 622   $Options{vectorstringformat} = '';
 623 
 624   if (!GetOptions(\%Options, "aromaticitymodel=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "estateatomtypessettouse|e=s", "filter|f=s", "fingerprintslabelmode=s", "fingerprintslabel=s",  "help|h", "keeplargestcomponent|k=s", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s", "valuesprecision=s", "vectorstringformat|v=s", "workingdir|w=s")) {
 625     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 626   }
 627   if ($Options{workingdir}) {
 628     if (! -d $Options{workingdir}) {
 629       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 630     }
 631     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 632   }
 633   if (!Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
 634     my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
 635     die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
 636   }
 637   if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
 638     die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
 639   }
 640   if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
 641     die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
 642   }
 643   if ($Options{estateatomtypessettouse} && $Options{estateatomtypessettouse} !~ /^(ArbitrarySize|FixedSize)$/) {
 644     die "Error: The value specified, $Options{estateatomtypessettouse}, for option \"-e, --EStateAtomTypesSetToUse\" is not valid. Allowed values: ArbitrarySize or FixedSize\n";
 645   }
 646   if ($Options{filter} !~ /^(Yes|No)$/i) {
 647     die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
 648   }
 649   if ($Options{fingerprintslabelmode} !~ /^(FingerprintsLabelOnly|FingerprintsLabelWithIDs)$/i) {
 650     die "Error: The value specified, $Options{fingerprintslabelmode}, for option \"--FingerprintsLabelMode\" is not valid. Allowed values: FingerprintsLabelOnly or FingerprintsLabelWithIDs\n";
 651   }
 652   if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
 653     die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
 654   }
 655   if ($Options{output} !~ /^(SD|FP|text|all)$/i) {
 656     die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";
 657   }
 658   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 659     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 660   }
 661   if ($Options{quote} !~ /^(Yes|No)$/i) {
 662     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
 663   }
 664   if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
 665     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
 666   }
 667   if (!IsPositiveInteger($Options{valuesprecision})) {
 668     die "Error: The value specified, $Options{valuesprecision}, for option \"--ValuesPrecision\" is not valid. Allowed values: > 0 \n";
 669   }
 670   if ($Options{vectorstringformat} && $Options{vectorstringformat} !~ /^(ValuesString|IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
 671     die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: ValuesString, IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
 672   }
 673 }
 674