MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # File: TopologicalPharmacophoreAtomPairsFingerprints.pl
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
  16 # any warranty; without even the implied warranty of merchantability of fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use FindBin; use lib "$FindBin::Bin/../lib";
  28 use Getopt::Long;
  29 use File::Basename;
  30 use Text::ParseWords;
  31 use Benchmark;
  32 use FileUtil;
  33 use TextUtil;
  34 use SDFileUtil;
  35 use MoleculeFileIO;
  36 use FileIO::FingerprintsSDFileIO;
  37 use FileIO::FingerprintsTextFileIO;
  38 use FileIO::FingerprintsFPFileIO;
  39 use AtomTypes::FunctionalClassAtomTypes;
  40 use Fingerprints::TopologicalPharmacophoreAtomPairsFingerprints;
  41 
  42 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  43 
  44 # Autoflush STDOUT
  45 $| = 1;
  46 
  47 # Starting message...
  48 $ScriptName = basename($0);
  49 print "\n$ScriptName: Starting...\n\n";
  50 $StartTime = new Benchmark;
  51 
  52 # Get the options and setup script...
  53 SetupScriptUsage();
  54 if ($Options{help} || @ARGV < 1) {
  55   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  56 }
  57 
  58 my(@SDFilesList);
  59 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  60 
  61 # Process options...
  62 print "Processing options...\n";
  63 my(%OptionsInfo);
  64 ProcessOptions();
  65 
  66 # Setup information about input files...
  67 print "Checking input SD file(s)...\n";
  68 my(%SDFilesInfo);
  69 RetrieveSDFilesInfo();
  70 
  71 # Process input files..
  72 my($FileIndex);
  73 if (@SDFilesList > 1) {
  74   print "\nProcessing SD files...\n";
  75 }
  76 for $FileIndex (0 .. $#SDFilesList) {
  77   if ($SDFilesInfo{FileOkay}[$FileIndex]) {
  78     print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  79     GenerateTopologicalPharmacophoreAtomPairsFingerprints($FileIndex);
  80   }
  81 }
  82 print "\n$ScriptName:Done...\n\n";
  83 
  84 $EndTime = new Benchmark;
  85 $TotalTime = timediff ($EndTime, $StartTime);
  86 print "Total time: ", timestr($TotalTime), "\n";
  87 
  88 ###############################################################################
  89 
  90 # Generate fingerprints for a SD file...
  91 #
  92 sub GenerateTopologicalPharmacophoreAtomPairsFingerprints {
  93   my($FileIndex) = @_;
  94   my($CmpdCount, $IgnoredCmpdCount, $SDFile, $MoleculeFileIO, $Molecule, $TopologicalPharmacophoreAtomPairsFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO, $SetupOutputFiles);
  95 
  96   $SDFile = $SDFilesList[$FileIndex];
  97 
  98   ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = (undef) x 3;
  99   $SetupOutputFiles = 1;
 100 
 101   $MoleculeFileIO = new MoleculeFileIO('Name' => $SDFile);
 102   $MoleculeFileIO->Open();
 103 
 104   $CmpdCount = 0;
 105   $IgnoredCmpdCount = 0;
 106 
 107   COMPOUND: while ($Molecule = $MoleculeFileIO->ReadMolecule()) {
 108     $CmpdCount++;
 109 
 110     # Filter compound data before calculating fingerprints...
 111     if ($OptionsInfo{Filter}) {
 112       if (CheckAndFilterCompound($CmpdCount, $Molecule)) {
 113         $IgnoredCmpdCount++;
 114         next COMPOUND;
 115       }
 116     }
 117 
 118     $TopologicalPharmacophoreAtomPairsFingerprints = GenerateMoleculeFingerprints($Molecule);
 119     if (!$TopologicalPharmacophoreAtomPairsFingerprints) {
 120       $IgnoredCmpdCount++;
 121       ProcessIgnoredCompound('FingerprintsGenerationFailed', $CmpdCount, $Molecule);
 122       next COMPOUND;
 123     }
 124 
 125     if ($SetupOutputFiles) {
 126       $SetupOutputFiles = 0;
 127       SetupFingerprintsLabelValueIDs($TopologicalPharmacophoreAtomPairsFingerprints);
 128       ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = SetupAndOpenOutputFiles($FileIndex);
 129     }
 130 
 131     WriteDataToOutputFiles($FileIndex, $CmpdCount, $Molecule, $TopologicalPharmacophoreAtomPairsFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
 132   }
 133   $MoleculeFileIO->Close();
 134 
 135   if ($NewFPSDFileIO) {
 136     $NewFPSDFileIO->Close();
 137   }
 138   if ($NewFPTextFileIO) {
 139     $NewFPTextFileIO->Close();
 140   }
 141   if ($NewFPFileIO) {
 142     $NewFPFileIO->Close();
 143   }
 144 
 145   WriteFingerprintsGenerationSummaryStatistics($CmpdCount, $IgnoredCmpdCount);
 146 }
 147 
 148 # Process compound being ignored due to problems in fingerprints geneation...
 149 #
 150 sub ProcessIgnoredCompound {
 151   my($Mode, $CmpdCount, $Molecule) = @_;
 152   my($CmpdID, $DataFieldLabelAndValuesRef);
 153 
 154   $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
 155   $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 156 
 157   MODE: {
 158     if ($Mode =~ /^ContainsNonElementalData$/i) {
 159       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
 160       next MODE;
 161     }
 162 
 163     if ($Mode =~ /^ContainsNoElementalData$/i) {
 164       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n";
 165       next MODE;
 166     }
 167 
 168     if ($Mode =~ /^FingerprintsGenerationFailed$/i) {
 169       warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
 170       next MODE;
 171     }
 172     warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
 173   }
 174 }
 175 
 176 # Check and filter compounds....
 177 #
 178 sub CheckAndFilterCompound {
 179   my($CmpdCount, $Molecule) = @_;
 180   my($ElementCount, $NonElementCount);
 181 
 182   ($ElementCount, $NonElementCount) = $Molecule->GetNumOfElementsAndNonElements();
 183 
 184   if ($NonElementCount) {
 185     ProcessIgnoredCompound('ContainsNonElementalData', $CmpdCount, $Molecule);
 186     return 1;
 187   }
 188 
 189   if (!$ElementCount) {
 190     ProcessIgnoredCompound('ContainsNoElementalData', $CmpdCount, $Molecule);
 191     return 1;
 192   }
 193 
 194   return 0;
 195 }
 196 
 197 # Write out compounds fingerprints generation summary statistics...
 198 #
 199 sub WriteFingerprintsGenerationSummaryStatistics {
 200   my($CmpdCount, $IgnoredCmpdCount) = @_;
 201   my($ProcessedCmpdCount);
 202 
 203   $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;
 204 
 205   print "\nNumber of compounds: $CmpdCount\n";
 206   print "Number of compounds processed successfully during fingerprints generation: $ProcessedCmpdCount\n";
 207   print "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
 208 }
 209 
 210 # Append atom pair value IDs to fingerprint label...
 211 #
 212 sub SetupFingerprintsLabelValueIDs {
 213   my($TopologicalPharmacophoreAtomPairsFingerprints) = @_;
 214 
 215   if ($OptionsInfo{AtomPairsSetSizeToUse} =~ /^ArbitrarySize$/i ||
 216       $OptionsInfo{FingerprintsLabelMode} !~ /^FingerprintsLabelWithIDs$/i) {
 217     return;
 218   }
 219 
 220   $OptionsInfo{FingerprintsLabel} .= "; Value IDs: " . $TopologicalPharmacophoreAtomPairsFingerprints->GetFingerprintsVector->GetValueIDsString();
 221 }
 222 
 223 # Open output files...
 224 #
 225 sub SetupAndOpenOutputFiles {
 226   my($FileIndex) = @_;
 227   my($NewFPSDFile, $NewFPFile, $NewFPTextFile, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO, %FingerprintsFileIOParams);
 228 
 229   ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = (undef) x 3;
 230 
 231   # Setup common parameters for fingerprints file IO objects...
 232   #
 233   %FingerprintsFileIOParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsVectorString', 'VectorStringFormat' => $OptionsInfo{VectorStringFormat});
 234 
 235   if ($OptionsInfo{SDOutput}) {
 236     $NewFPSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
 237     print "Generating SD file $NewFPSDFile...\n";
 238     $NewFPSDFileIO = new FileIO::FingerprintsSDFileIO('Name' => $NewFPSDFile, %FingerprintsFileIOParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
 239     $NewFPSDFileIO->Open();
 240   }
 241 
 242   if ($OptionsInfo{FPOutput}) {
 243     $NewFPFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
 244     print "Generating FP file $NewFPFile...\n";
 245     $NewFPFileIO = new FileIO::FingerprintsFPFileIO('Name' => $NewFPFile, %FingerprintsFileIOParams);
 246     $NewFPFileIO->Open();
 247   }
 248 
 249   if ($OptionsInfo{TextOutput}) {
 250     my($ColLabelsRef);
 251 
 252     $NewFPTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
 253     $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);
 254 
 255     print "Generating text file $NewFPTextFile...\n";
 256     $NewFPTextFileIO = new FileIO::FingerprintsTextFileIO('Name' => $NewFPTextFile, %FingerprintsFileIOParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
 257     $NewFPTextFileIO->Open();
 258   }
 259 
 260   return ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
 261 }
 262 
 263 # Write fingerpritns and other data to appropriate output files...
 264 #
 265 sub WriteDataToOutputFiles {
 266   my($FileIndex, $CmpdCount, $Molecule, $TopologicalPharmacophoreAtomPairsFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;
 267   my($DataFieldLabelAndValuesRef);
 268 
 269   $DataFieldLabelAndValuesRef = undef;
 270   if ($NewFPTextFileIO || $NewFPFileIO) {
 271     $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
 272   }
 273 
 274   if ($NewFPSDFileIO) {
 275     my($CmpdString);
 276 
 277     $CmpdString = $Molecule->GetInputMoleculeString();
 278     $NewFPSDFileIO->WriteFingerprints($TopologicalPharmacophoreAtomPairsFingerprints, $CmpdString);
 279   }
 280 
 281   if ($NewFPTextFileIO) {
 282     my($ColValuesRef);
 283 
 284     $ColValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 285     $NewFPTextFileIO->WriteFingerprints($TopologicalPharmacophoreAtomPairsFingerprints, $ColValuesRef);
 286   }
 287 
 288   if ($NewFPFileIO) {
 289     my($CompoundID);
 290 
 291     $CompoundID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 292     $NewFPFileIO->WriteFingerprints($TopologicalPharmacophoreAtomPairsFingerprints, $CompoundID);
 293   }
 294 }
 295 
 296 # Generate approriate column labels for FPText output file...
 297 #
 298 sub SetupFPTextFileCoulmnLabels {
 299   my($FileIndex) = @_;
 300   my($Line, @ColLabels);
 301 
 302   @ColLabels = ();
 303   if ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
 304     push @ColLabels, @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
 305   }
 306   elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
 307     push @ColLabels, @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
 308   }
 309   elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
 310     push @ColLabels, @{$OptionsInfo{SpecifiedDataFields}};
 311   }
 312   elsif ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
 313     push @ColLabels, $OptionsInfo{CompoundIDLabel};
 314   }
 315   # Add fingerprints label...
 316   push @ColLabels, $OptionsInfo{FingerprintsLabel};
 317 
 318   return \@ColLabels;
 319 }
 320 
 321 # Generate column values FPText output file..
 322 #
 323 sub SetupFPTextFileCoulmnValues {
 324   my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
 325   my(@ColValues);
 326 
 327   @ColValues = ();
 328   if ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) {
 329     push @ColValues, SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
 330   }
 331   elsif ($OptionsInfo{DataFieldsMode} =~ /^All$/i) {
 332     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
 333   }
 334   elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) {
 335     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
 336   }
 337   elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) {
 338     @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$OptionsInfo{SpecifiedDataFields}};
 339   }
 340 
 341   return \@ColValues;
 342 }
 343 
 344 # Generate compound ID for FP and FPText output files..
 345 #
 346 sub SetupCmpdIDForOutputFiles {
 347   my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
 348   my($CmpdID);
 349 
 350   $CmpdID = '';
 351   if ($OptionsInfo{CompoundIDMode} =~ /^MolNameOrLabelPrefix$/i) {
 352     my($MolName);
 353     $MolName = $Molecule->GetName();
 354     $CmpdID = $MolName ? $MolName : "$OptionsInfo{CompoundID}${CmpdCount}";
 355   }
 356   elsif ($OptionsInfo{CompoundIDMode} =~ /^LabelPrefix$/i) {
 357     $CmpdID = "$OptionsInfo{CompoundID}${CmpdCount}";
 358   }
 359   elsif ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i) {
 360     my($SpecifiedDataField);
 361     $SpecifiedDataField = $OptionsInfo{CompoundID};
 362     $CmpdID = exists $DataFieldLabelAndValuesRef->{$SpecifiedDataField} ? $DataFieldLabelAndValuesRef->{$SpecifiedDataField} : '';
 363   }
 364   elsif ($OptionsInfo{CompoundIDMode} =~ /^MolName$/i) {
 365     $CmpdID = $Molecule->GetName();
 366   }
 367   return $CmpdID;
 368 }
 369 
 370 # Generate fingerprints for molecule...
 371 #
 372 sub GenerateMoleculeFingerprints {
 373   my($Molecule) = @_;
 374   my($TopologicalPharmacophoreAtomPairsFingerprints);
 375 
 376   if ($OptionsInfo{KeepLargestComponent}) {
 377     $Molecule->KeepLargestComponent();
 378   }
 379   if (!$Molecule->DetectRings()) {
 380     return undef;
 381   }
 382   $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
 383   $Molecule->DetectAromaticity();
 384 
 385   if ($OptionsInfo{FuzzifyAtomPairsCount}) {
 386     $TopologicalPharmacophoreAtomPairsFingerprints = new Fingerprints::TopologicalPharmacophoreAtomPairsFingerprints('Molecule' => $Molecule, 'AtomPairsSetSizeToUse' => $OptionsInfo{AtomPairsSetSizeToUse}, 'MinDistance' => $OptionsInfo{MinDistance},  'MaxDistance' => $OptionsInfo{MaxDistance}, 'AtomTypesToUse' => \@{$OptionsInfo{AtomTypesToUse}}, , 'NormalizationMethodology' => $OptionsInfo{NormalizationMethodology}, , 'ValuesPrecision' => $OptionsInfo{ValuesPrecision}, 'FuzzifyAtomPairsCount' => $OptionsInfo{FuzzifyAtomPairsCount}, 'FuzzificationMode' =>  $OptionsInfo{FuzzificationMode}, 'FuzzificationMethodology' => $OptionsInfo{FuzzificationMethodology}, 'FuzzFactor' => $OptionsInfo{FuzzFactor});
 387   }
 388   else {
 389     $TopologicalPharmacophoreAtomPairsFingerprints = new Fingerprints::TopologicalPharmacophoreAtomPairsFingerprints('Molecule' => $Molecule, 'AtomPairsSetSizeToUse' => $OptionsInfo{AtomPairsSetSizeToUse}, 'MinDistance' => $OptionsInfo{MinDistance},  'MaxDistance' => $OptionsInfo{MaxDistance}, 'AtomTypesToUse' => \@{$OptionsInfo{AtomTypesToUse}}, 'NormalizationMethodology' => $OptionsInfo{NormalizationMethodology}, 'ValuesPrecision' => $OptionsInfo{ValuesPrecision});
 390   }
 391 
 392   # Set atom types weights...
 393   if ($OptionsInfo{UseAtomTypesWeight}) {
 394     $TopologicalPharmacophoreAtomPairsFingerprints->SetAtomTypesWeight(%{$OptionsInfo{AtomTypesWeight}});
 395   }
 396 
 397   # Generate fingerprints...
 398   $TopologicalPharmacophoreAtomPairsFingerprints->GenerateFingerprints();
 399 
 400   # Make sure fingerprints generation is successful...
 401   if (!$TopologicalPharmacophoreAtomPairsFingerprints->IsFingerprintsGenerationSuccessful()) {
 402     return undef;
 403   }
 404 
 405   return $TopologicalPharmacophoreAtomPairsFingerprints;
 406 }
 407 
 408 # Retrieve information about SD files...
 409 #
 410 sub RetrieveSDFilesInfo {
 411   my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);
 412 
 413   %SDFilesInfo = ();
 414   @{$SDFilesInfo{FileOkay}} = ();
 415   @{$SDFilesInfo{OutFileRoot}} = ();
 416   @{$SDFilesInfo{SDOutFileNames}} = ();
 417   @{$SDFilesInfo{FPOutFileNames}} = ();
 418   @{$SDFilesInfo{TextOutFileNames}} = ();
 419   @{$SDFilesInfo{AllDataFieldsRef}} = ();
 420   @{$SDFilesInfo{CommonDataFieldsRef}} = ();
 421 
 422   $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
 423   $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;
 424 
 425   FILELIST: for $Index (0 .. $#SDFilesList) {
 426     $SDFile = $SDFilesList[$Index];
 427 
 428     $SDFilesInfo{FileOkay}[$Index] = 0;
 429     $SDFilesInfo{OutFileRoot}[$Index] = '';
 430     $SDFilesInfo{SDOutFileNames}[$Index] = '';
 431     $SDFilesInfo{FPOutFileNames}[$Index] = '';
 432     $SDFilesInfo{TextOutFileNames}[$Index] = '';
 433 
 434     $SDFile = $SDFilesList[$Index];
 435     if (!(-e $SDFile)) {
 436       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 437       next FILELIST;
 438     }
 439     if (!CheckFileType($SDFile, "sd sdf")) {
 440       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 441       next FILELIST;
 442     }
 443 
 444     if ($CheckDataField) {
 445       # Make sure data field exists in SD file..
 446       my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);
 447 
 448       @CmpdLines = ();
 449       open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 450       $CmpdString = ReadCmpdString(\*SDFILE);
 451       close SDFILE;
 452       @CmpdLines = split "\n", $CmpdString;
 453       %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 454       $SpecifiedDataField = $OptionsInfo{CompoundID};
 455       if (!exists $DataFieldValues{$SpecifiedDataField}) {
 456         warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using  \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
 457         next FILELIST;
 458       }
 459     }
 460 
 461     $AllDataFieldsRef = '';
 462     $CommonDataFieldsRef = '';
 463     if ($CollectDataFields) {
 464       my($CmpdCount);
 465       open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
 466       ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE);
 467       close SDFILE;
 468     }
 469 
 470     # Setup output file names...
 471     $FileDir = ""; $FileName = ""; $FileExt = "";
 472     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 473 
 474     $TextOutFileExt = "csv";
 475     if ($Options{outdelim} =~ /^tab$/i) {
 476       $TextOutFileExt = "tsv";
 477     }
 478     $SDOutFileExt = $FileExt;
 479     $FPOutFileExt = "fpf";
 480 
 481     if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
 482       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 483       if ($RootFileName && $RootFileExt) {
 484         $FileName = $RootFileName;
 485       }
 486       else {
 487         $FileName = $OptionsInfo{OutFileRoot};
 488       }
 489       $OutFileRoot = $FileName;
 490     }
 491     else {
 492       $OutFileRoot = "${FileName}TopologicalPharmacophoreAtomPairsFP";
 493     }
 494 
 495     $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
 496     $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
 497     $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";
 498 
 499     if ($OptionsInfo{SDOutput}) {
 500       if ($SDFile =~ /$NewSDFileName/i) {
 501         warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
 502         print "Specify a different name using \"-r --root\" option or use default name.\n";
 503         next FILELIST;
 504       }
 505     }
 506 
 507     if (!$OptionsInfo{OverwriteFiles}) {
 508       # Check SD and text outout files...
 509       if ($OptionsInfo{SDOutput}) {
 510         if (-e $NewSDFileName) {
 511           warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
 512           next FILELIST;
 513         }
 514       }
 515       if ($OptionsInfo{FPOutput}) {
 516         if (-e $NewFPFileName) {
 517           warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
 518           next FILELIST;
 519         }
 520       }
 521       if ($OptionsInfo{TextOutput}) {
 522         if (-e $NewTextFileName) {
 523           warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
 524           next FILELIST;
 525         }
 526       }
 527     }
 528 
 529     $SDFilesInfo{FileOkay}[$Index] = 1;
 530 
 531     $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
 532     $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
 533     $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
 534     $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;
 535 
 536     $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
 537     $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
 538   }
 539 }
 540 
 541 # Process option values...
 542 sub ProcessOptions {
 543   %OptionsInfo = ();
 544 
 545   ProcessAtomTypesToUseOption();
 546   ProcessAtomTypesWeightOption();
 547 
 548   $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel};
 549 
 550   $OptionsInfo{AtomPairsSetSizeToUse} = $Options{atompairssetsizetouse};
 551 
 552   $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
 553   $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
 554   $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};
 555 
 556   my(@SpecifiedDataFields);
 557   @SpecifiedDataFields = ();
 558 
 559   @{$OptionsInfo{SpecifiedDataFields}} = ();
 560   $OptionsInfo{CompoundID} = '';
 561 
 562   if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
 563     if ($Options{compoundidmode} =~ /^DataField$/i) {
 564       if (!$Options{compoundid}) {
 565         die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
 566       }
 567       $OptionsInfo{CompoundID} = $Options{compoundid};
 568     }
 569     elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
 570       $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
 571     }
 572   }
 573   elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
 574     if (!$Options{datafields}) {
 575       die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
 576     }
 577     @SpecifiedDataFields = split /\,/, $Options{datafields};
 578     push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields;
 579   }
 580 
 581   $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;
 582 
 583   $OptionsInfo{FingerprintsLabelMode} = $Options{fingerprintslabelmode};
 584   $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'TopologicalPharmacophoreAtomPairsFingerprints';
 585 
 586   $OptionsInfo{FuzzifyAtomPairsCount} = ($Options{fuzzifyatompairscount} =~ /^Yes$/i) ? 1 : 0;
 587   $OptionsInfo{FuzzificationMode} = $Options{fuzzificationmode};
 588   $OptionsInfo{FuzzificationMethodology} = $Options{fuzzificationmethodology};
 589   $OptionsInfo{FuzzFactor} = $Options{fuzzfactor};
 590 
 591   $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;
 592 
 593   $OptionsInfo{MinDistance} = $Options{mindistance};
 594   $OptionsInfo{MaxDistance} = $Options{maxdistance};
 595 
 596   $OptionsInfo{NormalizationMethodology} = $Options{normalizationmethodology};
 597 
 598   $OptionsInfo{Output} = $Options{output};
 599   $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
 600   $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
 601   $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;
 602 
 603   $OptionsInfo{OutDelim} = $Options{outdelim};
 604   $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;
 605 
 606   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
 607   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
 608 
 609   $OptionsInfo{ValuesPrecision} = $Options{valuesprecision};
 610 
 611   # Setup default vector string format...
 612   my($VectorStringFormat);
 613   $VectorStringFormat = '';
 614 
 615   if ($Options{vectorstringformat}) {
 616     $VectorStringFormat = $Options{vectorstringformat};
 617 
 618     if ($Options{atompairssetsizetouse} =~ /^ArbitrarySize$/i && $VectorStringFormat =~ /^ValuesString$/i) {
 619       die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid for $Options{atompairssetsizetouse} value of \"--AtomPairsSetSizeToUse\" option. Allowed values: IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
 620     }
 621   }
 622   else {
 623     $VectorStringFormat = ($Options{atompairssetsizetouse} =~ /^FixedSize$/) ? "ValuesString" : "IDsAndValuesString";
 624   }
 625   $OptionsInfo{VectorStringFormat} = $VectorStringFormat;
 626 }
 627 
 628 # Process atom type to use option...
 629 #
 630 sub ProcessAtomTypesToUseOption {
 631   my($AtomType, $SpecifiedAtomTypesToUse, @AtomTypesWords);
 632 
 633   @{$OptionsInfo{AtomTypesToUse}} = ();
 634   if (IsEmpty($Options{atomtypestouse})) {
 635     die "Error: Atom types value specified using \"-a, --AtomTypesToUse\" option is empty\n";
 636   }
 637 
 638   $SpecifiedAtomTypesToUse = $Options{atomtypestouse};
 639   $SpecifiedAtomTypesToUse =~ s/ //g;
 640   @AtomTypesWords = split /\,/, $SpecifiedAtomTypesToUse;
 641 
 642   for $AtomType (@AtomTypesWords) {
 643     if (!AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($AtomType)) {
 644       die "Error: Atomic type specified, $AtomType, using \"-a, --AtomTypesToUse\" option is not valid...\n ";
 645     }
 646     push @{$OptionsInfo{AtomTypesToUse}}, $AtomType;
 647   }
 648 }
 649 
 650 # Process atom types weight option...
 651 #
 652 sub ProcessAtomTypesWeightOption {
 653   my($Index, $AtomType, $AtomTypeWeight, $SpecifiedAtomTypesWeight, @AtomTypesWeightsPairs);
 654 
 655   %{$OptionsInfo{AtomTypesWeight}} = ();
 656 
 657   if (IsEmpty($Options{atomtypesweight})) {
 658     die "Error: Atom types weight value specified using \"--AtomTypesWeight\" option is empty\n";
 659   }
 660   $OptionsInfo{UseAtomTypesWeight} = ($Options{atomtypesweight} =~ /^None$/i) ? 0 : 1;
 661   if (!$OptionsInfo{UseAtomTypesWeight}) {
 662     return;
 663   }
 664 
 665   # Process specified atom type/weight pairs...
 666   $SpecifiedAtomTypesWeight = $Options{atomtypesweight};
 667   $SpecifiedAtomTypesWeight =~ s/ //g;
 668   @AtomTypesWeightsPairs = split /\,/, $SpecifiedAtomTypesWeight;
 669 
 670   if (@AtomTypesWeightsPairs % 2) {
 671     die "Error: Invalid number of values specified using \"--AtomTypesWeight\" option: It must contain even number of values.\n";
 672   }
 673 
 674   for ($Index = 0; $Index < @AtomTypesWeightsPairs; $Index += 2) {
 675     $AtomType = $AtomTypesWeightsPairs[$Index]; $AtomTypeWeight = $AtomTypesWeightsPairs[$Index + 1];
 676     if (!AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($AtomType)) {
 677       die "Error: Atom type specified, $AtomType, using \"--AtomTypesWeight\" option is not valid\n ";
 678     }
 679     if (!(IsFloat($AtomTypeWeight) && $AtomTypeWeight >= 0)) {
 680       die "Error: Atom type weight specified, $AtomTypeWeight, using option \"--AtomTypesWeight\" is not valid. Allowed values: real numbers >= 0 \n";
 681     }
 682     $OptionsInfo{AtomTypesWeight}{$AtomType} = $AtomTypeWeight;
 683   }
 684 }
 685 
 686 # Setup script usage  and retrieve command line arguments specified using various options...
 687 sub SetupScriptUsage {
 688 
 689   # Retrieve all the options...
 690   %Options = ();
 691 
 692   $Options{aromaticitymodel} = 'MayaChemToolsAromaticityModel';
 693 
 694   $Options{atompairssetsizetouse} = 'ArbitrarySize';
 695 
 696   $Options{atomtypestouse} = 'HBD,HBA,PI,NI,H';
 697   $Options{atomtypesweight} = 'None';
 698 
 699   $Options{compoundidmode} = 'LabelPrefix';
 700   $Options{compoundidlabel} = 'CompoundID';
 701   $Options{datafieldsmode} = 'CompoundID';
 702 
 703   $Options{filter} = 'Yes';
 704 
 705   $Options{fingerprintslabelmode} = 'FingerprintsLabelOnly';
 706 
 707   $Options{fuzzifyatompairscount} = 'No';
 708   $Options{fuzzificationmode} = 'AfterNormalization';
 709   $Options{fuzzificationmethodology} = 'FuzzyBinning';
 710   $Options{fuzzfactor} = 0.15;
 711 
 712   $Options{keeplargestcomponent} = 'Yes';
 713 
 714   $Options{mindistance} = 1;
 715   $Options{maxdistance} = 10;
 716 
 717   $Options{normalizationmethodology} = 'None';
 718 
 719   $Options{output} = 'text';
 720   $Options{outdelim} = 'comma';
 721   $Options{quote} = 'yes';
 722 
 723   $Options{valuesprecision} = 2;
 724 
 725   $Options{vectorstringformat} = '';
 726 
 727   if (!GetOptions(\%Options, "aromaticitymodel=s", "atompairssetsizetouse=s", "atomtypestouse|a=s", "atomtypesweight=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "filter|f=s", "fingerprintslabelmode=s", "fingerprintslabel=s", "fuzzifyatompairscount=s", "fuzzificationmode=s", "fuzzificationmethodology=s", "fuzzfactor=s", "help|h", "keeplargestcomponent|k=s",  "mindistance=s", "maxdistance=s", "normalizationmethodology|n=s", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s", "valuesprecision=s", "vectorstringformat|v=s", "workingdir|w=s")) {
 728     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 729   }
 730   if ($Options{workingdir}) {
 731     if (! -d $Options{workingdir}) {
 732       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 733     }
 734     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 735   }
 736   if (!Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
 737     my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
 738     die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
 739   }
 740   if ($Options{atompairssetsizetouse} !~ /^(ArbitrarySize|FixedSize)$/i) {
 741     die "Error: The value specified, $Options{atompairssetsizetouse}, for option \"--AtomPairsSetSizeToUse\" is not valid. Allowed values: ArbitrarySize or FixedSize\n";
 742   }
 743   if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
 744     die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
 745   }
 746   if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
 747     die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
 748   }
 749   if ($Options{filter} !~ /^(Yes|No)$/i) {
 750     die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
 751   }
 752   if ($Options{fingerprintslabelmode} !~ /^(FingerprintsLabelOnly|FingerprintsLabelWithIDs)$/i) {
 753     die "Error: The value specified, $Options{fingerprintslabelmode}, for option \"--FingerprintsLabelMode\" is not valid. Allowed values: FingerprintsLabelOnly or FingerprintsLabelWithIDs\n";
 754   }
 755   if ($Options{fuzzifyatompairscount} !~ /^(Yes|No)$/i) {
 756     die "Error: The value specified, $Options{fuzzifyatompairscount}, for option \"--FuzzifyAtomPairsCount\" is not valid. Allowed values: Yes or No\n";
 757   }
 758   if ($Options{fuzzificationmode} !~ /^(BeforeNormalization|AfterNormalization)$/i) {
 759     die "Error: The value specified, $Options{fuzzificationmode}, for option \"--FuzzificationMode\" is not valid. Allowed values: BeforeNormalization or AfterNormalization\n";
 760   }
 761   if ($Options{fuzzificationmethodology} !~ /^(FuzzyBinning|FuzzyBinSmoothing)$/i) {
 762     die "Error: The value specified, $Options{fuzzificationmethodology}, for option \"--FuzzificationMethodology\" is not valid. Allowed values: FuzzyBinning or FuzzyBinSmoothing\n";
 763   }
 764   if (!IsFloat($Options{fuzzfactor})) {
 765     die "Error: The value specified, $Options{fuzzfactor}, for option \"--FuzzFactor\" is not valid. Allowed values: real numbers >= 0 \n";
 766   }
 767   if ($Options{fuzzificationmethodology} !~ /^FuzzyBinning$/i) {
 768     if (!($Options{fuzzfactor} >=0 && $Options{fuzzfactor} <= 1.0)) {
 769       die "Error: The value specified, $Options{fuzzfactor}, for option \"--FuzzFactor\" during FuzzyBinning \"--FuzzificationMethodology\" is not valid. Allowed values: >= 0 and <= 1 \n";
 770     }
 771   }
 772   elsif ($Options{fuzzificationmethodology} !~ /^FuzzyBinSmoothing$/i) {
 773     if (!($Options{fuzzfactor} >=0 && $Options{fuzzfactor} <= 0.5)) {
 774       die "Error: The value specified, $Options{fuzzfactor}, for option \"--FuzzFactor\" during FuzzyBinSmoothing \"--FuzzificationMethodology\" is not valid. Allowed values: >= 0 and <= 0.5 \n";
 775     }
 776   }
 777   if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
 778     die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
 779   }
 780   if (!IsInteger($Options{mindistance})) {
 781     die "Error: The value specified, $Options{mindistance}, for option \"--MinDistance\" is not valid. Allowed values: >= 0 \n";
 782   }
 783   if (!IsPositiveInteger($Options{maxdistance})) {
 784     die "Error: The value specified, $Options{maxdistance}, for option \"--MaxDistance\" is not valid. Allowed values: > 0 \n";
 785   }
 786   if ($Options{mindistance} > $Options{maxdistance}) {
 787     die "Error: The value specified, specified, $Options{mindistance}, for option \"--MinDistance\" must be less than the value specified, $Options{maxdistance}, for option \"--MaxDistance\" \n";
 788   }
 789   if ($Options{normalizationmethodology} !~ /^(None|ByHeavyAtomsCount|ByAtomTypesCount)$/i) {
 790     die "Error: The value specified, $Options{normalizationmethodology}, for option \"--NormalizationMethodology\" is not valid. Allowed values: None, ByHeavyAtomsCount, or ByAtomTypesCount\n";
 791   }
 792   if ($Options{output} !~ /^(SD|FP|text|all)$/i) {
 793     die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";
 794   }
 795   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 796     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 797   }
 798   if ($Options{quote} !~ /^(Yes|No)$/i) {
 799     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
 800   }
 801   if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
 802     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
 803   }
 804   if (!IsPositiveInteger($Options{valuesprecision})) {
 805     die "Error: The value specified, $Options{valuesprecision}, for option \"--ValuesPrecision\" is not valid. Allowed values: > 0 \n";
 806   }
 807   if ($Options{vectorstringformat} && $Options{vectorstringformat} !~ /^(ValuesString|IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
 808     die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: ValuesString, IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
 809   }
 810 }
 811