MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # File: TopologicalPharmacophoreAtomTripletsFingerprints.pl
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2019 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
  16 # any warranty; without even the implied warranty of merchantability or fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use FindBin; use lib "$FindBin::Bin/../lib";
  28 use Getopt::Long;
  29 use File::Basename;
  30 use Text::ParseWords;
  31 use Benchmark;
  32 use FileUtil;
  33 use TextUtil;
  34 use SDFileUtil;
  35 use MoleculeFileIO;
  36 use FileIO::FingerprintsSDFileIO;
  37 use FileIO::FingerprintsTextFileIO;
  38 use FileIO::FingerprintsFPFileIO;
  39 use AtomTypes::FunctionalClassAtomTypes;
  40 use Fingerprints::TopologicalPharmacophoreAtomTripletsFingerprints;
  41 
  42 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  43 
  44 # Autoflush STDOUT
  45 $| = 1;
  46 
  47 # Starting message...
  48 $ScriptName = basename($0);
  49 print "\n$ScriptName: Starting...\n\n";
  50 $StartTime = new Benchmark;
  51 
  52 # Get the options and setup script...
  53 SetupScriptUsage();
  54 if ($Options{help} || @ARGV < 1) {
  55   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  56 }
  57 
  58 my(@SDFilesList);
  59 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  60 
  61 # Process options...
  62 print "Processing options...\n";
  63 my(%OptionsInfo);
  64 ProcessOptions();
  65 
  66 # Setup information about input files...
  67 print "Checking input SD file(s)...\n";
  68 my(%SDFilesInfo);
  69 RetrieveSDFilesInfo();
  70 
  71 # Process input files..
  72 my($FileIndex);
  73 if (@SDFilesList > 1) {
  74   print "\nProcessing SD files...\n";
  75 }
  76 for $FileIndex (0 .. $#SDFilesList) {
  77   if ($SDFilesInfo{FileOkay}[$FileIndex]) {
  78     print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  79     GenerateTopologicalPharmacophoreAtomTripletsFingerprints($FileIndex);
  80   }
  81 }
  82 print "\n$ScriptName:Done...\n\n";
  83 
  84 $EndTime = new Benchmark;
  85 $TotalTime = timediff ($EndTime, $StartTime);
  86 print "Total time: ", timestr($TotalTime), "\n";
  87 
  88 ###############################################################################
  89 
  90 # Generate fingerprints for a SD file...
  91 #
  92 sub GenerateTopologicalPharmacophoreAtomTripletsFingerprints {
  93   my($FileIndex) = @_;
  94   my($CmpdCount, $IgnoredCmpdCount, $SDFile, $MoleculeFileIO, $Molecule, $TopologicalPharmacophoreAtomTripletsFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO, $SetupOutputFiles);
  95 
  96   $SDFile = $SDFilesList[$FileIndex];
  97 
  98   ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = (undef) x 3;
  99   $SetupOutputFiles = 1;
 100 
 101   $MoleculeFileIO = new MoleculeFileIO('Name' => $SDFile);
 102   $MoleculeFileIO->Open();
 103 
 104   $CmpdCount = 0;
 105   $IgnoredCmpdCount = 0;
 106 
 107   COMPOUND: while ($Molecule = $MoleculeFileIO->ReadMolecule()) {
 108     $CmpdCount++;
 109 
 110     # Filter compound data before calculating fingerprints...
 111     if ($OptionsInfo{Filter}) {
 112       if (CheckAndFilterCompound($CmpdCount, $Molecule)) {
 113         $IgnoredCmpdCount++;
 114         next COMPOUND;
 115       }
 116     }
 117 
 118     $TopologicalPharmacophoreAtomTripletsFingerprints = GenerateMoleculeFingerprints($Molecule);
 119     if (!$TopologicalPharmacophoreAtomTripletsFingerprints) {
 120       $IgnoredCmpdCount++;
 121       ProcessIgnoredCompound('FingerprintsGenerationFailed', $CmpdCount, $Molecule);
 122       next COMPOUND;
 123     }
 124 
 125     if ($SetupOutputFiles) {
 126       $SetupOutputFiles = 0;
 127       SetupFingerprintsLabelValueIDs($TopologicalPharmacophoreAtomTripletsFingerprints);
 128       ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = SetupAndOpenOutputFiles($FileIndex);
 129     }
 130 
 131     WriteDataToOutputFiles($FileIndex, $CmpdCount, $Molecule, $TopologicalPharmacophoreAtomTripletsFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
 132   }
 133   $MoleculeFileIO->Close();
 134 
 135   if ($NewFPSDFileIO) {
 136     $NewFPSDFileIO->Close();
 137   }
 138   if ($NewFPTextFileIO) {
 139     $NewFPTextFileIO->Close();
 140   }
 141   if ($NewFPFileIO) {
 142     $NewFPFileIO->Close();
 143   }
 144 
 145   WriteFingerprintsGenerationSummaryStatistics($CmpdCount, $IgnoredCmpdCount);
 146 }
 147 
 148 # Process compound being ignored due to problems in fingerprints geneation...
 149 #
 150 sub ProcessIgnoredCompound {
 151   my($Mode, $CmpdCount, $Molecule) = @_;
 152   my($CmpdID, $DataFieldLabelAndValuesRef);
 153 
 154   $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
 155   $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 156 
 157   MODE: {
 158     if ($Mode =~ /^ContainsNonElementalData$/i) {
 159       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
 160       next MODE;
 161     }
 162 
 163     if ($Mode =~ /^ContainsNoElementalData$/i) {
 164       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n";
 165       next MODE;
 166     }
 167 
 168     if ($Mode =~ /^FingerprintsGenerationFailed$/i) {
 169       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
 170       next MODE;
 171     }
 172     warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
 173   }
 174 }
 175 
 176 # Check and filter compounds....
 177 #
 178 sub CheckAndFilterCompound {
 179   my($CmpdCount, $Molecule) = @_;
 180   my($ElementCount, $NonElementCount);
 181 
 182   ($ElementCount, $NonElementCount) = $Molecule->GetNumOfElementsAndNonElements();
 183 
 184   if ($NonElementCount) {
 185     ProcessIgnoredCompound('ContainsNonElementalData', $CmpdCount, $Molecule);
 186     return 1;
 187   }
 188 
 189   if (!$ElementCount) {
 190     ProcessIgnoredCompound('ContainsNoElementalData', $CmpdCount, $Molecule);
 191     return 1;
 192   }
 193 
 194   return 0;
 195 }
 196 
 197 # Write out compounds fingerprints generation summary statistics...
 198 #
 199 sub WriteFingerprintsGenerationSummaryStatistics {
 200   my($CmpdCount, $IgnoredCmpdCount) = @_;
 201   my($ProcessedCmpdCount);
 202 
 203   $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;
 204 
 205   print "\nNumber of compounds: $CmpdCount\n";
 206   print "Number of compounds processed successfully during fingerprints generation: $ProcessedCmpdCount\n";
 207   print "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
 208 }
 209 
 210 # Append atom pair value IDs to fingerprint label...
 211 #
 212 sub SetupFingerprintsLabelValueIDs {
 213   my($TopologicalPharmacophoreAtomTripletsFingerprints) = @_;
 214 
 215   if ($OptionsInfo{AtomTripletsSetSizeToUse} =~ /^ArbitrarySize$/i ||
 216       $OptionsInfo{FingerprintsLabelMode} !~ /^FingerprintsLabelWithIDs$/i) {
 217     return;
 218   }
 219   $OptionsInfo{FingerprintsLabel} .= "; Value IDs: " . $TopologicalPharmacophoreAtomTripletsFingerprints->GetFingerprintsVector->GetValueIDsString();
 220 }
 221 
 222 # Open output files...
 223 #
 224 sub SetupAndOpenOutputFiles {
 225   my($FileIndex) = @_;
 226   my($NewFPSDFile, $NewFPFile, $NewFPTextFile, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO, %FingerprintsFileIOParams);
 227 
 228   ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = (undef) x 3;
 229 
 230   # Setup common parameters for fingerprints file IO objects...
 231   #
 232   %FingerprintsFileIOParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsVectorString', 'VectorStringFormat' => $OptionsInfo{VectorStringFormat});
 233 
 234   if ($OptionsInfo{SDOutput}) {
 235     $NewFPSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
 236     print "Generating SD file $NewFPSDFile...\n";
 237     $NewFPSDFileIO = new FileIO::FingerprintsSDFileIO('Name' => $NewFPSDFile, %FingerprintsFileIOParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
 238     $NewFPSDFileIO->Open();
 239   }
 240 
 241   if ($OptionsInfo{FPOutput}) {
 242     $NewFPFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
 243     print "Generating FP file $NewFPFile...\n";
 244     $NewFPFileIO = new FileIO::FingerprintsFPFileIO('Name' => $NewFPFile, %FingerprintsFileIOParams);
 245     $NewFPFileIO->Open();
 246   }
 247 
 248   if ($OptionsInfo{TextOutput}) {
 249     my($ColLabelsRef);
 250 
 251     $NewFPTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
 252     $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);
 253 
 254     print "Generating text file $NewFPTextFile...\n";
 255     $NewFPTextFileIO = new FileIO::FingerprintsTextFileIO('Name' => $NewFPTextFile, %FingerprintsFileIOParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
 256     $NewFPTextFileIO->Open();
 257   }
 258 
 259   return ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
 260 }
 261 
 262 # Write fingerpritns and other data to appropriate output files...
 263 #
 264 sub WriteDataToOutputFiles {
 265   my($FileIndex, $CmpdCount, $Molecule, $TopologicalPharmacophoreAtomTripletsFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;
 266   my($DataFieldLabelAndValuesRef);
 267 
 268   $DataFieldLabelAndValuesRef = undef;
 269   if ($NewFPTextFileIO || $NewFPFileIO) {
 270     $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
 271   }
 272 
 273   if ($NewFPSDFileIO) {
 274     my($CmpdString);
 275 
 276     $CmpdString = $Molecule->GetInputMoleculeString();
 277     $NewFPSDFileIO->WriteFingerprints($TopologicalPharmacophoreAtomTripletsFingerprints, $CmpdString);
 278   }
 279 
 280   if ($NewFPTextFileIO) {
 281     my($ColValuesRef);
 282 
 283     $ColValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 284     $NewFPTextFileIO->WriteFingerprints($TopologicalPharmacophoreAtomTripletsFingerprints, $ColValuesRef);
 285   }
 286 
 287   if ($NewFPFileIO) {
 288     my($CompoundID);
 289 
 290     $CompoundID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 291     $NewFPFileIO->WriteFingerprints($TopologicalPharmacophoreAtomTripletsFingerprints, $CompoundID);
 292   }
 293 }
 294 
 295 # Generate approriate column labels for FPText output file...
 296 #
 297 sub SetupFPTextFileCoulmnLabels {
 298   my($FileIndex) = @_;
 299   my($Line, @ColLabels);
 300 
 301   @ColLabels = ();
 302   if ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
 303     push @ColLabels, @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
 304   }
 305   elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
 306     push @ColLabels, @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
 307   }
 308   elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
 309     push @ColLabels, @{$OptionsInfo{SpecifiedDataFields}};
 310   }
 311   elsif ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
 312     push @ColLabels, $OptionsInfo{CompoundIDLabel};
 313   }
 314   # Add fingerprints label...
 315   push @ColLabels, $OptionsInfo{FingerprintsLabel};
 316 
 317   return \@ColLabels;
 318 }
 319 
 320 # Generate column values FPText output file..
 321 #
 322 sub SetupFPTextFileCoulmnValues {
 323   my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
 324   my(@ColValues);
 325 
 326   @ColValues = ();
 327   if ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
 328     push @ColValues, SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 329   }
 330   elsif ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
 331     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
 332   }
 333   elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
 334     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
 335   }
 336   elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
 337     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$OptionsInfo{SpecifiedDataFields}};
 338   }
 339 
 340   return \@ColValues;
 341 }
 342 
 343 # Generate compound ID for FP and FPText output files..
 344 #
 345 sub SetupCmpdIDForOutputFiles {
 346   my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
 347   my($CmpdID);
 348 
 349   $CmpdID = '';
 350   if ($OptionsInfo{CompoundIDMode} =~ /^MolNameOrLabelPrefix$/i) {
 351     my($MolName);
 352     $MolName = $Molecule->GetName();
 353     $CmpdID = $MolName ? $MolName : "$OptionsInfo{CompoundID}${CmpdCount}";
 354   }
 355   elsif ($OptionsInfo{CompoundIDMode} =~ /^LabelPrefix$/i) {
 356     $CmpdID = "$OptionsInfo{CompoundID}${CmpdCount}";
 357   }
 358   elsif ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i) {
 359     my($SpecifiedDataField);
 360     $SpecifiedDataField = $OptionsInfo{CompoundID};
 361     $CmpdID = exists $DataFieldLabelAndValuesRef->{$SpecifiedDataField} ? $DataFieldLabelAndValuesRef->{$SpecifiedDataField} : '';
 362   }
 363   elsif ($OptionsInfo{CompoundIDMode} =~ /^MolName$/i) {
 364     $CmpdID = $Molecule->GetName();
 365   }
 366   return $CmpdID;
 367 }
 368 
 369 # Generate fingerprints for molecule...
 370 #
 371 sub GenerateMoleculeFingerprints {
 372   my($Molecule) = @_;
 373   my($TopologicalPharmacophoreAtomTripletsFingerprints);
 374 
 375   if ($OptionsInfo{KeepLargestComponent}) {
 376     $Molecule->KeepLargestComponent();
 377   }
 378   if (!$Molecule->DetectRings()) {
 379     return undef;
 380   }
 381   $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
 382   $Molecule->DetectAromaticity();
 383 
 384   $TopologicalPharmacophoreAtomTripletsFingerprints = new Fingerprints::TopologicalPharmacophoreAtomTripletsFingerprints('Molecule' => $Molecule, 'AtomTripletsSetSizeToUse' => $OptionsInfo{AtomTripletsSetSizeToUse}, 'MinDistance' => $OptionsInfo{MinDistance},  'MaxDistance' => $OptionsInfo{MaxDistance}, 'DistanceBinSize' => $OptionsInfo{DistanceBinSize}, 'UseTriangleInequality' => $OptionsInfo{UseTriangleInequality}, 'AtomTypesToUse' => \@{$OptionsInfo{AtomTypesToUse}});
 385 
 386   # Generate fingerprints...
 387   $TopologicalPharmacophoreAtomTripletsFingerprints->GenerateFingerprints();
 388 
 389   # Make sure fingerprints generation is successful...
 390   if (!$TopologicalPharmacophoreAtomTripletsFingerprints->IsFingerprintsGenerationSuccessful()) {
 391     return undef;
 392   }
 393 
 394   return $TopologicalPharmacophoreAtomTripletsFingerprints;
 395 }
 396 
 397 # Retrieve information about SD files...
 398 #
 399 sub RetrieveSDFilesInfo {
 400   my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);
 401 
 402   %SDFilesInfo = ();
 403   @{$SDFilesInfo{FileOkay}} = ();
 404   @{$SDFilesInfo{OutFileRoot}} = ();
 405   @{$SDFilesInfo{SDOutFileNames}} = ();
 406   @{$SDFilesInfo{FPOutFileNames}} = ();
 407   @{$SDFilesInfo{TextOutFileNames}} = ();
 408   @{$SDFilesInfo{AllDataFieldsRef}} = ();
 409   @{$SDFilesInfo{CommonDataFieldsRef}} = ();
 410 
 411   $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
 412   $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;
 413 
 414   FILELIST: for $Index (0 .. $#SDFilesList) {
 415     $SDFile = $SDFilesList[$Index];
 416 
 417     $SDFilesInfo{FileOkay}[$Index] = 0;
 418     $SDFilesInfo{OutFileRoot}[$Index] = '';
 419     $SDFilesInfo{SDOutFileNames}[$Index] = '';
 420     $SDFilesInfo{FPOutFileNames}[$Index] = '';
 421     $SDFilesInfo{TextOutFileNames}[$Index] = '';
 422 
 423     $SDFile = $SDFilesList[$Index];
 424     if (!(-e $SDFile)) {
 425       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 426       next FILELIST;
 427     }
 428     if (!CheckFileType($SDFile, "sd sdf")) {
 429       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 430       next FILELIST;
 431     }
 432 
 433     if ($CheckDataField) {
 434       # Make sure data field exists in SD file..
 435       my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);
 436 
 437       @CmpdLines = ();
 438       open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 439       $CmpdString = ReadCmpdString(\*SDFILE);
 440       close SDFILE;
 441       @CmpdLines = split "\n", $CmpdString;
 442       %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 443       $SpecifiedDataField = $OptionsInfo{CompoundID};
 444       if (!exists $DataFieldValues{$SpecifiedDataField}) {
 445         warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using  \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
 446         next FILELIST;
 447       }
 448     }
 449 
 450     $AllDataFieldsRef = '';
 451     $CommonDataFieldsRef = '';
 452     if ($CollectDataFields) {
 453       my($CmpdCount);
 454       open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 455       ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE);
 456       close SDFILE;
 457     }
 458 
 459     # Setup output file names...
 460     $FileDir = ""; $FileName = ""; $FileExt = "";
 461     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 462 
 463     $TextOutFileExt = "csv";
 464     if ($Options{outdelim} =~ /^tab$/i) {
 465       $TextOutFileExt = "tsv";
 466     }
 467     $SDOutFileExt = $FileExt;
 468     $FPOutFileExt = "fpf";
 469 
 470     if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
 471       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 472       if ($RootFileName && $RootFileExt) {
 473         $FileName = $RootFileName;
 474       }
 475       else {
 476         $FileName = $OptionsInfo{OutFileRoot};
 477       }
 478       $OutFileRoot = $FileName;
 479     }
 480     else {
 481       $OutFileRoot = "${FileName}TopologicalPharmacophoreAtomTripletsFP";
 482     }
 483 
 484     $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
 485     $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
 486     $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";
 487 
 488     if ($OptionsInfo{SDOutput}) {
 489       if ($SDFile =~ /$NewSDFileName/i) {
 490         warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
 491         print "Specify a different name using \"-r --root\" option or use default name.\n";
 492         next FILELIST;
 493       }
 494     }
 495 
 496     if (!$OptionsInfo{OverwriteFiles}) {
 497       # Check SD and text outout files...
 498       if ($OptionsInfo{SDOutput}) {
 499         if (-e $NewSDFileName) {
 500           warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
 501           next FILELIST;
 502         }
 503       }
 504       if ($OptionsInfo{FPOutput}) {
 505         if (-e $NewFPFileName) {
 506           warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
 507           next FILELIST;
 508         }
 509       }
 510       if ($OptionsInfo{TextOutput}) {
 511         if (-e $NewTextFileName) {
 512           warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
 513           next FILELIST;
 514         }
 515       }
 516     }
 517 
 518     $SDFilesInfo{FileOkay}[$Index] = 1;
 519 
 520     $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
 521     $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
 522     $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
 523     $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;
 524 
 525     $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
 526     $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
 527   }
 528 }
 529 
 530 # Process option values...
 531 sub ProcessOptions {
 532   %OptionsInfo = ();
 533 
 534   ProcessAtomTypesToUseOption();
 535 
 536   $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel};
 537 
 538   $OptionsInfo{AtomTripletsSetSizeToUse} = $Options{atomtripletssetsizetouse};
 539 
 540   $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
 541   $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
 542   $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};
 543 
 544   my(@SpecifiedDataFields);
 545   @SpecifiedDataFields = ();
 546 
 547   @{$OptionsInfo{SpecifiedDataFields}} = ();
 548   $OptionsInfo{CompoundID} = '';
 549 
 550   if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
 551     if ($Options{compoundidmode} =~ /^DataField$/i) {
 552       if (!$Options{compoundid}) {
 553         die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
 554       }
 555       $OptionsInfo{CompoundID} = $Options{compoundid};
 556     }
 557     elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
 558       $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
 559     }
 560   }
 561   elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
 562     if (!$Options{datafields}) {
 563       die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
 564     }
 565     @SpecifiedDataFields = split /\,/, $Options{datafields};
 566     push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields;
 567   }
 568 
 569   $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;
 570 
 571   $OptionsInfo{FingerprintsLabelMode} = $Options{fingerprintslabelmode};
 572   $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'TopologicalPharmacophoreAtomTripletsFingerprints';
 573 
 574   $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;
 575 
 576   $OptionsInfo{DistanceBinSize} = $Options{distancebinsize};
 577 
 578   $OptionsInfo{MinDistance} = $Options{mindistance};
 579   $OptionsInfo{MaxDistance} = $Options{maxdistance};
 580 
 581   $OptionsInfo{Output} = $Options{output};
 582   $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
 583   $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
 584   $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;
 585 
 586   $OptionsInfo{OutDelim} = $Options{outdelim};
 587   $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;
 588 
 589   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
 590   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
 591 
 592   $OptionsInfo{UseTriangleInequality} = ($Options{usetriangleinequality} =~ /^Yes$/i) ? 1 : 0;
 593 
 594   # Setup default vector string format...
 595   my($VectorStringFormat);
 596   $VectorStringFormat = '';
 597 
 598   if ($Options{vectorstringformat}) {
 599     $VectorStringFormat = $Options{vectorstringformat};
 600 
 601     if ($Options{atomtripletssetsizetouse} =~ /^ArbitrarySize$/i && $VectorStringFormat =~ /^ValuesString$/i) {
 602       die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid for $Options{atomtripletssetsizetouse} value of \"--AtomTripletsSetSizeToUse\" option. Allowed values: IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
 603     }
 604   }
 605   else {
 606     $VectorStringFormat = ($Options{atomtripletssetsizetouse} =~ /^FixedSize$/) ? "ValuesString" : "IDsAndValuesString";
 607   }
 608   $OptionsInfo{VectorStringFormat} = $VectorStringFormat;
 609 }
 610 
 611 # Process atom type to use option...
 612 #
 613 sub ProcessAtomTypesToUseOption {
 614   my($AtomType, $SpecifiedAtomTypesToUse, @AtomTypesWords);
 615 
 616   @{$OptionsInfo{AtomTypesToUse}} = ();
 617   if (IsEmpty($Options{atomtypestouse})) {
 618     die "Error: Atom types value specified using \"-a, --AtomTypesToUse\" option is empty\n";
 619   }
 620 
 621   $SpecifiedAtomTypesToUse = $Options{atomtypestouse};
 622   $SpecifiedAtomTypesToUse =~ s/ //g;
 623   @AtomTypesWords = split /\,/, $SpecifiedAtomTypesToUse;
 624 
 625   for $AtomType (@AtomTypesWords) {
 626     if (!AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($AtomType)) {
 627       die "Error: Atom type specified, $AtomType, using \"-a, --AtomTypesToUse\" option is not valid...\n ";
 628     }
 629     push @{$OptionsInfo{AtomTypesToUse}}, $AtomType;
 630   }
 631 }
 632 
 633 # Setup script usage  and retrieve command line arguments specified using various options...
 634 sub SetupScriptUsage {
 635 
 636   # Retrieve all the options...
 637   %Options = ();
 638 
 639   $Options{aromaticitymodel} = 'MayaChemToolsAromaticityModel';
 640 
 641   $Options{atomtripletssetsizetouse} = 'ArbitrarySize';
 642 
 643   $Options{atomtypestouse} = 'HBD,HBA,PI,NI,H,Ar';
 644 
 645   $Options{compoundidmode} = 'LabelPrefix';
 646   $Options{compoundidlabel} = 'CompoundID';
 647   $Options{datafieldsmode} = 'CompoundID';
 648 
 649   $Options{filter} = 'Yes';
 650 
 651   $Options{fingerprintslabelmode} = 'FingerprintsLabelOnly';
 652 
 653   $Options{keeplargestcomponent} = 'Yes';
 654 
 655   $Options{mindistance} = 1;
 656   $Options{maxdistance} = 10;
 657 
 658   $Options{distancebinsize} = 2;
 659 
 660   $Options{usetriangleinequality} = 'Yes';
 661 
 662   $Options{output} = 'text';
 663   $Options{outdelim} = 'comma';
 664   $Options{quote} = 'yes';
 665 
 666   $Options{vectorstringformat} = '';
 667 
 668   if (!GetOptions(\%Options, "aromaticitymodel=s", "atomtripletssetsizetouse=s", "atomtypestouse|a=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "distancebinsize=s", "filter|f=s", "fingerprintslabelmode=s", "fingerprintslabel=s", "help|h", "keeplargestcomponent|k=s",  "mindistance=s", "maxdistance=s", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s", "usetriangleinequality|u=s", "vectorstringformat|v=s", "workingdir|w=s")) {
 669     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 670   }
 671   if ($Options{workingdir}) {
 672     if (! -d $Options{workingdir}) {
 673       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 674     }
 675     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 676   }
 677   if (!Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
 678     my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
 679     die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
 680   }
 681   if ($Options{atomtripletssetsizetouse} !~ /^(ArbitrarySize|FixedSize)$/i) {
 682     die "Error: The value specified, $Options{atomtripletssetsizetouse}, for option \"--AtomTripletsSetSizeToUse\" is not valid. Allowed values: ArbitrarySize or FixedSize\n";
 683   }
 684   if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
 685     die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
 686   }
 687   if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
 688     die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
 689   }
 690   if (!IsPositiveInteger($Options{distancebinsize})) {
 691     die "Error: The value specified, $Options{distancebinsize}, for option \"--DistanceBinSize\" is not valid. Allowed values: > 0 \n";
 692   }
 693   if ($Options{filter} !~ /^(Yes|No)$/i) {
 694     die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
 695   }
 696   if ($Options{fingerprintslabelmode} !~ /^(FingerprintsLabelOnly|FingerprintsLabelWithIDs)$/i) {
 697     die "Error: The value specified, $Options{fingerprintslabelmode}, for option \"--FingerprintsLabelMode\" is not valid. Allowed values: FingerprintsLabelOnly or FingerprintsLabelWithIDs\n";
 698   }
 699   if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
 700     die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
 701   }
 702   if (!IsPositiveInteger($Options{mindistance})) {
 703     die "Error: The value specified, $Options{mindistance}, for option \"--MinDistance\" is not valid. Allowed values: > 0 \n";
 704   }
 705   if (!IsPositiveInteger($Options{maxdistance})) {
 706     die "Error: The value specified, $Options{maxdistance}, for option \"--MaxDistance\" is not valid. Allowed values: > 0 \n";
 707   }
 708   if ($Options{mindistance} > $Options{maxdistance}) {
 709     die "Error: The value specified, specified, $Options{mindistance}, for option \"--MinDistance\" must be less than the value specified, $Options{maxdistance}, for option \"--MaxDistance\" \n";
 710   }
 711   if ($Options{output} !~ /^(SD|FP|text|all)$/i) {
 712     die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";
 713   }
 714   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 715     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 716   }
 717   if ($Options{quote} !~ /^(Yes|No)$/i) {
 718     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
 719   }
 720   if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
 721     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
 722   }
 723   if ($Options{usetriangleinequality} !~ /^(Yes|No)$/i) {
 724     die "Error: The value specified, $Options{usetriangleinequality}, for option \"-u, --UseTriangleInequality\" is not valid. Allowed values: Yes or No\n";
 725   }
 726   if ($Options{vectorstringformat} && $Options{vectorstringformat} !~ /^(ValuesString|IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
 727     die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: ValuesString, IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
 728   }
 729 }
 730