MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # File: TopologicalAtomTripletsFingerprints.pl
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
  16 # any warranty; without even the implied warranty of merchantability of fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use FindBin; use lib "$FindBin::Bin/../lib";
  28 use Getopt::Long;
  29 use File::Basename;
  30 use Text::ParseWords;
  31 use Benchmark;
  32 use FileUtil;
  33 use TextUtil;
  34 use SDFileUtil;
  35 use MoleculeFileIO;
  36 use FileIO::FingerprintsSDFileIO;
  37 use FileIO::FingerprintsTextFileIO;
  38 use FileIO::FingerprintsFPFileIO;
  39 use AtomTypes::AtomicInvariantsAtomTypes;
  40 use AtomTypes::FunctionalClassAtomTypes;
  41 use Fingerprints::TopologicalAtomTripletsFingerprints;
  42 
  43 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  44 
  45 # Autoflush STDOUT
  46 $| = 1;
  47 
  48 # Starting message...
  49 $ScriptName = basename($0);
  50 print "\n$ScriptName: Starting...\n\n";
  51 $StartTime = new Benchmark;
  52 
  53 # Get the options and setup script...
  54 SetupScriptUsage();
  55 if ($Options{help} || @ARGV < 1) {
  56   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  57 }
  58 
  59 my(@SDFilesList);
  60 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  61 
  62 # Process options...
  63 print "Processing options...\n";
  64 my(%OptionsInfo);
  65 ProcessOptions();
  66 
  67 # Setup information about input files...
  68 print "Checking input SD file(s)...\n";
  69 my(%SDFilesInfo);
  70 RetrieveSDFilesInfo();
  71 
  72 # Process input files..
  73 my($FileIndex);
  74 if (@SDFilesList > 1) {
  75   print "\nProcessing SD files...\n";
  76 }
  77 for $FileIndex (0 .. $#SDFilesList) {
  78   if ($SDFilesInfo{FileOkay}[$FileIndex]) {
  79     print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  80     GenerateTopologicalAtomTripletsFingerprints($FileIndex);
  81   }
  82 }
  83 print "\n$ScriptName:Done...\n\n";
  84 
  85 $EndTime = new Benchmark;
  86 $TotalTime = timediff ($EndTime, $StartTime);
  87 print "Total time: ", timestr($TotalTime), "\n";
  88 
  89 ###############################################################################
  90 
  91 # Generate fingerprints for a SD file...
  92 #
  93 sub GenerateTopologicalAtomTripletsFingerprints {
  94   my($FileIndex) = @_;
  95   my($CmpdCount, $IgnoredCmpdCount, $SDFile, $MoleculeFileIO, $Molecule, $TopologicalAtomTripletsFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
  96 
  97   $SDFile = $SDFilesList[$FileIndex];
  98 
  99   # Setup output files...
 100   #
 101   ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = SetupAndOpenOutputFiles($FileIndex);
 102 
 103   $MoleculeFileIO = new MoleculeFileIO('Name' => $SDFile);
 104   $MoleculeFileIO->Open();
 105 
 106   $CmpdCount = 0;
 107   $IgnoredCmpdCount = 0;
 108 
 109   COMPOUND: while ($Molecule = $MoleculeFileIO->ReadMolecule()) {
 110     $CmpdCount++;
 111 
 112     # Filter compound data before calculating fingerprints...
 113     if ($OptionsInfo{Filter}) {
 114       if (CheckAndFilterCompound($CmpdCount, $Molecule)) {
 115         $IgnoredCmpdCount++;
 116         next COMPOUND;
 117       }
 118     }
 119 
 120     $TopologicalAtomTripletsFingerprints = GenerateMoleculeFingerprints($Molecule);
 121     if (!$TopologicalAtomTripletsFingerprints) {
 122       $IgnoredCmpdCount++;
 123       ProcessIgnoredCompound('FingerprintsGenerationFailed', $CmpdCount, $Molecule);
 124       next COMPOUND;
 125     }
 126 
 127     WriteDataToOutputFiles($FileIndex, $CmpdCount, $Molecule, $TopologicalAtomTripletsFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
 128   }
 129   $MoleculeFileIO->Close();
 130 
 131   if ($NewFPSDFileIO) {
 132     $NewFPSDFileIO->Close();
 133   }
 134   if ($NewFPTextFileIO) {
 135     $NewFPTextFileIO->Close();
 136   }
 137   if ($NewFPFileIO) {
 138     $NewFPFileIO->Close();
 139   }
 140 
 141   WriteFingerprintsGenerationSummaryStatistics($CmpdCount, $IgnoredCmpdCount);
 142 }
 143 
 144 # Process compound being ignored due to problems in fingerprints geneation...
 145 #
 146 sub ProcessIgnoredCompound {
 147   my($Mode, $CmpdCount, $Molecule) = @_;
 148   my($CmpdID, $DataFieldLabelAndValuesRef);
 149 
 150   $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
 151   $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 152 
 153   MODE: {
 154     if ($Mode =~ /^ContainsNonElementalData$/i) {
 155       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
 156       next MODE;
 157     }
 158 
 159     if ($Mode =~ /^ContainsNoElementalData$/i) {
 160       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n";
 161       next MODE;
 162     }
 163 
 164     if ($Mode =~ /^FingerprintsGenerationFailed$/i) {
 165       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
 166       next MODE;
 167     }
 168     warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
 169   }
 170 }
 171 
 172 # Check and filter compounds....
 173 #
 174 sub CheckAndFilterCompound {
 175   my($CmpdCount, $Molecule) = @_;
 176   my($ElementCount, $NonElementCount);
 177 
 178   ($ElementCount, $NonElementCount) = $Molecule->GetNumOfElementsAndNonElements();
 179 
 180   if ($NonElementCount) {
 181     ProcessIgnoredCompound('ContainsNonElementalData', $CmpdCount, $Molecule);
 182     return 1;
 183   }
 184 
 185   if (!$ElementCount) {
 186     ProcessIgnoredCompound('ContainsNoElementalData', $CmpdCount, $Molecule);
 187     return 1;
 188   }
 189 
 190   return 0;
 191 }
 192 
 193 # Write out compounds fingerprints generation summary statistics...
 194 #
 195 sub WriteFingerprintsGenerationSummaryStatistics {
 196   my($CmpdCount, $IgnoredCmpdCount) = @_;
 197   my($ProcessedCmpdCount);
 198 
 199   $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;
 200 
 201   print "\nNumber of compounds: $CmpdCount\n";
 202   print "Number of compounds processed successfully during fingerprints generation: $ProcessedCmpdCount\n";
 203   print "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
 204 }
 205 
 206 # Open output files...
 207 #
 208 sub SetupAndOpenOutputFiles {
 209   my($FileIndex) = @_;
 210   my($NewFPSDFile, $NewFPFile, $NewFPTextFile, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO, %FingerprintsFileIOParams);
 211 
 212   ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = (undef) x 3;
 213 
 214   # Setup common parameters for fingerprints file IO objects...
 215   #
 216   %FingerprintsFileIOParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsVectorString', 'VectorStringFormat' => $OptionsInfo{VectorStringFormat});
 217 
 218   if ($OptionsInfo{SDOutput}) {
 219     $NewFPSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
 220     print "Generating SD file $NewFPSDFile...\n";
 221     $NewFPSDFileIO = new FileIO::FingerprintsSDFileIO('Name' => $NewFPSDFile, %FingerprintsFileIOParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
 222     $NewFPSDFileIO->Open();
 223   }
 224 
 225   if ($OptionsInfo{FPOutput}) {
 226     $NewFPFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
 227     print "Generating FP file $NewFPFile...\n";
 228     $NewFPFileIO = new FileIO::FingerprintsFPFileIO('Name' => $NewFPFile, %FingerprintsFileIOParams);
 229     $NewFPFileIO->Open();
 230   }
 231 
 232   if ($OptionsInfo{TextOutput}) {
 233     my($ColLabelsRef);
 234 
 235     $NewFPTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
 236     $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);
 237 
 238     print "Generating text file $NewFPTextFile...\n";
 239     $NewFPTextFileIO = new FileIO::FingerprintsTextFileIO('Name' => $NewFPTextFile, %FingerprintsFileIOParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
 240     $NewFPTextFileIO->Open();
 241   }
 242 
 243   return ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
 244 }
 245 
 246 # Write fingerpritns and other data to appropriate output files...
 247 #
 248 sub WriteDataToOutputFiles {
 249   my($FileIndex, $CmpdCount, $Molecule, $TopologicalAtomTripletsFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;
 250   my($DataFieldLabelAndValuesRef);
 251 
 252   $DataFieldLabelAndValuesRef = undef;
 253   if ($NewFPTextFileIO || $NewFPFileIO) {
 254     $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
 255   }
 256 
 257   if ($NewFPSDFileIO) {
 258     my($CmpdString);
 259 
 260     $CmpdString = $Molecule->GetInputMoleculeString();
 261     $NewFPSDFileIO->WriteFingerprints($TopologicalAtomTripletsFingerprints, $CmpdString);
 262   }
 263 
 264   if ($NewFPTextFileIO) {
 265     my($ColValuesRef);
 266 
 267     $ColValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 268     $NewFPTextFileIO->WriteFingerprints($TopologicalAtomTripletsFingerprints, $ColValuesRef);
 269   }
 270 
 271   if ($NewFPFileIO) {
 272     my($CompoundID);
 273 
 274     $CompoundID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 275     $NewFPFileIO->WriteFingerprints($TopologicalAtomTripletsFingerprints, $CompoundID);
 276   }
 277 }
 278 
 279 # Generate approriate column labels for FPText output file...
 280 #
 281 sub SetupFPTextFileCoulmnLabels {
 282   my($FileIndex) = @_;
 283   my($Line, @ColLabels);
 284 
 285   @ColLabels = ();
 286   if ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
 287     push @ColLabels, @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
 288   }
 289   elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
 290     push @ColLabels, @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
 291   }
 292   elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
 293     push @ColLabels, @{$OptionsInfo{SpecifiedDataFields}};
 294   }
 295   elsif ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
 296     push @ColLabels, $OptionsInfo{CompoundIDLabel};
 297   }
 298   # Add fingerprints label...
 299   push @ColLabels, $OptionsInfo{FingerprintsLabel};
 300 
 301   return \@ColLabels;
 302 }
 303 
 304 # Generate column values FPText output file..
 305 #
 306 sub SetupFPTextFileCoulmnValues {
 307   my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
 308   my(@ColValues);
 309 
 310   @ColValues = ();
 311   if ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
 312     push @ColValues, SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 313   }
 314   elsif ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
 315     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
 316   }
 317   elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
 318     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
 319   }
 320   elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
 321     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$OptionsInfo{SpecifiedDataFields}};
 322   }
 323 
 324   return \@ColValues;
 325 }
 326 
 327 # Generate compound ID for FP and FPText output files..
 328 #
 329 sub SetupCmpdIDForOutputFiles {
 330   my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
 331   my($CmpdID);
 332 
 333   $CmpdID = '';
 334   if ($OptionsInfo{CompoundIDMode} =~ /^MolNameOrLabelPrefix$/i) {
 335     my($MolName);
 336     $MolName = $Molecule->GetName();
 337     $CmpdID = $MolName ? $MolName : "$OptionsInfo{CompoundID}${CmpdCount}";
 338   }
 339   elsif ($OptionsInfo{CompoundIDMode} =~ /^LabelPrefix$/i) {
 340     $CmpdID = "$OptionsInfo{CompoundID}${CmpdCount}";
 341   }
 342   elsif ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i) {
 343     my($SpecifiedDataField);
 344     $SpecifiedDataField = $OptionsInfo{CompoundID};
 345     $CmpdID = exists $DataFieldLabelAndValuesRef->{$SpecifiedDataField} ? $DataFieldLabelAndValuesRef->{$SpecifiedDataField} : '';
 346   }
 347   elsif ($OptionsInfo{CompoundIDMode} =~ /^MolName$/i) {
 348     $CmpdID = $Molecule->GetName();
 349   }
 350   return $CmpdID;
 351 }
 352 
 353 # Generate fingerprints for molecule...
 354 #
 355 sub GenerateMoleculeFingerprints {
 356   my($Molecule) = @_;
 357   my($TopologicalAtomTripletsFingerprints);
 358 
 359   if ($OptionsInfo{KeepLargestComponent}) {
 360     $Molecule->KeepLargestComponent();
 361   }
 362   if (!$Molecule->DetectRings()) {
 363     return undef;
 364   }
 365   $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
 366   $Molecule->DetectAromaticity();
 367 
 368   $TopologicalAtomTripletsFingerprints = new Fingerprints::TopologicalAtomTripletsFingerprints('Molecule' => $Molecule, 'MinDistance' => $OptionsInfo{MinDistance},  'MaxDistance' => $OptionsInfo{MaxDistance}, 'UseTriangleInequality' => $OptionsInfo{UseTriangleInequality}, 'AtomIdentifierType' => $OptionsInfo{AtomIdentifierType});
 369   SetAtomIdentifierTypeValuesToUse($TopologicalAtomTripletsFingerprints);
 370 
 371   # Generate fingerprints...
 372   $TopologicalAtomTripletsFingerprints->GenerateFingerprints();
 373 
 374   # Make sure fingerprints generation is successful...
 375   if (!$TopologicalAtomTripletsFingerprints->IsFingerprintsGenerationSuccessful()) {
 376     return undef;
 377   }
 378 
 379   return $TopologicalAtomTripletsFingerprints;
 380 }
 381 
 382 # Set atom identifier type to use for generating fingerprints...
 383 #
 384 sub SetAtomIdentifierTypeValuesToUse {
 385   my($TopologicalAtomTripletsFingerprints) = @_;
 386 
 387   if ($OptionsInfo{AtomIdentifierType} =~ /^AtomicInvariantsAtomTypes$/i) {
 388     $TopologicalAtomTripletsFingerprints->SetAtomicInvariantsToUse(\@{$OptionsInfo{AtomicInvariantsToUse}});
 389   }
 390   elsif ($OptionsInfo{AtomIdentifierType} =~ /^FunctionalClassAtomTypes$/i) {
 391     $TopologicalAtomTripletsFingerprints->SetFunctionalClassesToUse(\@{$OptionsInfo{FunctionalClassesToUse}});
 392   }
 393   elsif ($OptionsInfo{AtomIdentifierType} =~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
 394     # Nothing to do for now...
 395   }
 396   else {
 397     die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
 398   }
 399 }
 400 
 401 # Retrieve information about SD files...
 402 #
 403 sub RetrieveSDFilesInfo {
 404   my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);
 405 
 406   %SDFilesInfo = ();
 407   @{$SDFilesInfo{FileOkay}} = ();
 408   @{$SDFilesInfo{OutFileRoot}} = ();
 409   @{$SDFilesInfo{SDOutFileNames}} = ();
 410   @{$SDFilesInfo{FPOutFileNames}} = ();
 411   @{$SDFilesInfo{TextOutFileNames}} = ();
 412   @{$SDFilesInfo{AllDataFieldsRef}} = ();
 413   @{$SDFilesInfo{CommonDataFieldsRef}} = ();
 414 
 415   $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
 416   $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;
 417 
 418   FILELIST: for $Index (0 .. $#SDFilesList) {
 419     $SDFile = $SDFilesList[$Index];
 420 
 421     $SDFilesInfo{FileOkay}[$Index] = 0;
 422     $SDFilesInfo{OutFileRoot}[$Index] = '';
 423     $SDFilesInfo{SDOutFileNames}[$Index] = '';
 424     $SDFilesInfo{FPOutFileNames}[$Index] = '';
 425     $SDFilesInfo{TextOutFileNames}[$Index] = '';
 426 
 427     $SDFile = $SDFilesList[$Index];
 428     if (!(-e $SDFile)) {
 429       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 430       next FILELIST;
 431     }
 432     if (!CheckFileType($SDFile, "sd sdf")) {
 433       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 434       next FILELIST;
 435     }
 436 
 437     if ($CheckDataField) {
 438       # Make sure data field exists in SD file..
 439       my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);
 440 
 441       @CmpdLines = ();
 442       open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 443       $CmpdString = ReadCmpdString(\*SDFILE);
 444       close SDFILE;
 445       @CmpdLines = split "\n", $CmpdString;
 446       %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 447       $SpecifiedDataField = $OptionsInfo{CompoundID};
 448       if (!exists $DataFieldValues{$SpecifiedDataField}) {
 449         warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using  \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
 450         next FILELIST;
 451       }
 452     }
 453 
 454     $AllDataFieldsRef = '';
 455     $CommonDataFieldsRef = '';
 456     if ($CollectDataFields) {
 457       my($CmpdCount);
 458       open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 459       ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE);
 460       close SDFILE;
 461     }
 462 
 463     # Setup output file names...
 464     $FileDir = ""; $FileName = ""; $FileExt = "";
 465     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 466 
 467     $TextOutFileExt = "csv";
 468     if ($Options{outdelim} =~ /^tab$/i) {
 469       $TextOutFileExt = "tsv";
 470     }
 471     $SDOutFileExt = $FileExt;
 472     $FPOutFileExt = "fpf";
 473 
 474     if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
 475       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 476       if ($RootFileName && $RootFileExt) {
 477         $FileName = $RootFileName;
 478       }
 479       else {
 480         $FileName = $OptionsInfo{OutFileRoot};
 481       }
 482       $OutFileRoot = $FileName;
 483     }
 484     else {
 485       $OutFileRoot = "${FileName}TopologicalAtomTripletsFP";
 486     }
 487 
 488     $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
 489     $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
 490     $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";
 491 
 492     if ($OptionsInfo{SDOutput}) {
 493       if ($SDFile =~ /$NewSDFileName/i) {
 494         warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
 495         print "Specify a different name using \"-r --root\" option or use default name.\n";
 496         next FILELIST;
 497       }
 498     }
 499 
 500     if (!$OptionsInfo{OverwriteFiles}) {
 501       # Check SD and text outout files...
 502       if ($OptionsInfo{SDOutput}) {
 503         if (-e $NewSDFileName) {
 504           warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
 505           next FILELIST;
 506         }
 507       }
 508       if ($OptionsInfo{FPOutput}) {
 509         if (-e $NewFPFileName) {
 510           warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
 511           next FILELIST;
 512         }
 513       }
 514       if ($OptionsInfo{TextOutput}) {
 515         if (-e $NewTextFileName) {
 516           warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
 517           next FILELIST;
 518         }
 519       }
 520     }
 521 
 522     $SDFilesInfo{FileOkay}[$Index] = 1;
 523 
 524     $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
 525     $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
 526     $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
 527     $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;
 528 
 529     $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
 530     $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
 531   }
 532 }
 533 
 534 # Process option values...
 535 sub ProcessOptions {
 536   %OptionsInfo = ();
 537 
 538   ProcessAtomIdentifierTypeOptions();
 539 
 540   $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel};
 541 
 542   $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
 543   $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
 544   $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};
 545 
 546   my(@SpecifiedDataFields);
 547   @SpecifiedDataFields = ();
 548 
 549   @{$OptionsInfo{SpecifiedDataFields}} = ();
 550   $OptionsInfo{CompoundID} = '';
 551 
 552   if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
 553     if ($Options{compoundidmode} =~ /^DataField$/i) {
 554       if (!$Options{compoundid}) {
 555         die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
 556       }
 557       $OptionsInfo{CompoundID} = $Options{compoundid};
 558     }
 559     elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
 560       $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
 561     }
 562   }
 563   elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
 564     if (!$Options{datafields}) {
 565       die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
 566     }
 567     @SpecifiedDataFields = split /\,/, $Options{datafields};
 568     push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields;
 569   }
 570 
 571   $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;
 572 
 573   $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'TopologicalAtomTripletsFingerprints';
 574 
 575   $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;
 576 
 577   $OptionsInfo{MinDistance} = $Options{mindistance};
 578   $OptionsInfo{MaxDistance} = $Options{maxdistance};
 579 
 580   $OptionsInfo{Output} = $Options{output};
 581   $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
 582   $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
 583   $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;
 584 
 585   $OptionsInfo{OutDelim} = $Options{outdelim};
 586   $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;
 587 
 588   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
 589   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
 590 
 591   $OptionsInfo{UseTriangleInequality} = ($Options{usetriangleinequality} =~ /^Yes$/i) ? 1 : 0;
 592 
 593   $OptionsInfo{VectorStringFormat} = $Options{vectorstringformat};
 594 }
 595 
 596 # Process atom identifier type and related options...
 597 #
 598 sub ProcessAtomIdentifierTypeOptions {
 599 
 600   $OptionsInfo{AtomIdentifierType} = $Options{atomidentifiertype};
 601 
 602   if ($Options{atomidentifiertype} =~ /^AtomicInvariantsAtomTypes$/i) {
 603     ProcessAtomicInvariantsToUseOption();
 604   }
 605   elsif ($Options{atomidentifiertype} =~ /^FunctionalClassAtomTypes$/i) {
 606     ProcessFunctionalClassesToUse();
 607   }
 608   elsif ($OptionsInfo{AtomIdentifierType} =~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
 609     # Nothing to do for now...
 610   }
 611   else {
 612     die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
 613   }
 614 }
 615 
 616 # Process specified atomic invariants to use...
 617 #
 618 sub ProcessAtomicInvariantsToUseOption {
 619   my($AtomicInvariant, $AtomSymbolSpecified, @AtomicInvariantsWords);
 620 
 621   @{$OptionsInfo{AtomicInvariantsToUse}} = ();
 622   if (IsEmpty($Options{atomicinvariantstouse})) {
 623     die "Error: Atomic invariants value specified using \"--AtomicInvariantsToUse\" option is empty\n";
 624   }
 625   $AtomSymbolSpecified = 0;
 626   @AtomicInvariantsWords = split /\,/, $Options{atomicinvariantstouse};
 627   for $AtomicInvariant (@AtomicInvariantsWords) {
 628     if (!AtomTypes::AtomicInvariantsAtomTypes::IsAtomicInvariantAvailable($AtomicInvariant)) {
 629       die "Error: Atomic invariant specified, $AtomicInvariant, using \"--AtomicInvariantsToUse\" option is not valid...\n ";
 630     }
 631     if ($AtomicInvariant =~ /^(AS|AtomSymbol)$/i) {
 632       $AtomSymbolSpecified = 1;
 633     }
 634     push @{$OptionsInfo{AtomicInvariantsToUse}}, $AtomicInvariant;
 635   }
 636   if (!$AtomSymbolSpecified) {
 637     die "Error: Atomic invariant, AS or AtomSymbol, must be specified as using \"--AtomicInvariantsToUse\" option...\n ";
 638   }
 639 }
 640 
 641 # Process specified functional classes invariants to use...
 642 #
 643 sub ProcessFunctionalClassesToUse {
 644   my($FunctionalClass, @FunctionalClassesToUseWords);
 645 
 646   @{$OptionsInfo{FunctionalClassesToUse}} = ();
 647   if (IsEmpty($Options{functionalclassestouse})) {
 648     die "Error: Functional classes value specified using \"--FunctionalClassesToUse\" option is empty\n";
 649   }
 650   @FunctionalClassesToUseWords = split /\,/, $Options{functionalclassestouse};
 651   for $FunctionalClass (@FunctionalClassesToUseWords) {
 652     if (!AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($FunctionalClass)) {
 653       die "Error: Functional class specified, $FunctionalClass, using \"--FunctionalClassesToUse\" option is not valid...\n ";
 654     }
 655     push @{$OptionsInfo{FunctionalClassesToUse}}, $FunctionalClass;
 656   }
 657 }
 658 
 659 # Setup script usage  and retrieve command line arguments specified using various options...
 660 sub SetupScriptUsage {
 661 
 662   # Retrieve all the options...
 663   %Options = ();
 664 
 665   $Options{aromaticitymodel} = 'MayaChemToolsAromaticityModel';
 666 
 667   $Options{atomidentifiertype} = 'AtomicInvariantsAtomTypes';
 668   $Options{atomicinvariantstouse} = 'AS,X,BO,H,FC';
 669 
 670   $Options{functionalclassestouse} = 'HBD,HBA,PI,NI,Ar,Hal';
 671 
 672   $Options{compoundidmode} = 'LabelPrefix';
 673   $Options{compoundidlabel} = 'CompoundID';
 674   $Options{datafieldsmode} = 'CompoundID';
 675 
 676   $Options{filter} = 'Yes';
 677 
 678   $Options{keeplargestcomponent} = 'Yes';
 679 
 680   $Options{mindistance} = 1;
 681   $Options{maxdistance} = 10;
 682 
 683   $Options{output} = 'text';
 684   $Options{outdelim} = 'comma';
 685   $Options{quote} = 'yes';
 686 
 687   $Options{usetriangleinequality} = 'No';
 688 
 689   $Options{vectorstringformat} = 'IDsAndValuesString';
 690 
 691   if (!GetOptions(\%Options, "aromaticitymodel=s", "atomidentifiertype|a=s", "atomicinvariantstouse=s", "functionalclassestouse=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "filter|f=s", "fingerprintslabel=s",  "help|h", "keeplargestcomponent|k=s",  "mindistance=s", "maxdistance=s", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s", , "usetriangleinequality|u=s", "vectorstringformat|v=s", "workingdir|w=s")) {
 692     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 693   }
 694   if ($Options{workingdir}) {
 695     if (! -d $Options{workingdir}) {
 696       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 697     }
 698     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 699   }
 700   if (!Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
 701     my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
 702     die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
 703   }
 704   if ($Options{atomidentifiertype} !~ /^(AtomicInvariantsAtomTypes|DREIDINGAtomTypes|EStateAtomTypes|FunctionalClassAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
 705     die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
 706   }
 707   if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
 708     die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
 709   }
 710   if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
 711     die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
 712   }
 713   if ($Options{filter} !~ /^(Yes|No)$/i) {
 714     die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
 715   }
 716   if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
 717     die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
 718   }
 719   if (!IsPositiveInteger($Options{mindistance})) {
 720     die "Error: The value specified, $Options{mindistance}, for option \"--MinDistance\" is not valid. Allowed values: > 0 \n";
 721   }
 722   if (!IsPositiveInteger($Options{maxdistance})) {
 723     die "Error: The value specified, $Options{maxdistance}, for option \"--MaxDistance\" is not valid. Allowed values: > 0 \n";
 724   }
 725   if ($Options{mindistance} > $Options{maxdistance}) {
 726     die "Error: The value specified, specified, $Options{mindistance}, for option \"--MinDistance\" must be less than the value specified, $Options{maxdistance}, for option \"--MaxDistance\" \n";
 727   }
 728   if ($Options{output} !~ /^(SD|FP|text|all)$/i) {
 729     die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";
 730   }
 731   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 732     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 733   }
 734   if ($Options{quote} !~ /^(Yes|No)$/i) {
 735     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
 736   }
 737   if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
 738     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
 739   }
 740   if ($Options{usetriangleinequality} !~ /^(Yes|No)$/i) {
 741     die "Error: The value specified, $Options{usetriangleinequality}, for option \"-u, --UseTriangleInequality\" is not valid. Allowed values: Yes or No\n";
 742   }
 743   if ($Options{vectorstringformat} !~ /^(IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
 744     die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
 745   }
 746 }
 747