MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # File: SimilaritySearchingFingerprints.pl
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2019 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
  16 # any warranty; without even the implied warranty of merchantability of fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use FindBin; use lib "$FindBin::Bin/../lib";
  28 use Getopt::Long;
  29 use File::Basename;
  30 use Text::ParseWords;
  31 use Benchmark;
  32 use FileUtil;
  33 use TextUtil;
  34 use SDFileUtil;
  35 use StatisticsUtil;
  36 use PseudoHeap;
  37 use Fingerprints::FingerprintsFileUtil;
  38 use Fingerprints::FingerprintsBitVector;
  39 use Fingerprints::FingerprintsVector;
  40 
  41 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  42 
  43 # Autoflush STDOUT
  44 $| = 1;
  45 
  46 # Starting message...
  47 $ScriptName = basename($0);
  48 print "\n$ScriptName: Starting...\n\n";
  49 $StartTime = new Benchmark;
  50 
  51 # Get the options and setup script...
  52 SetupScriptUsage();
  53 if ($Options{help} || @ARGV != 2) {
  54   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  55 }
  56 
  57 # Process reference and database file names...
  58 my(@FingerprintsFilesList);
  59 ProcessFingerprintsFileNames();
  60 
  61 # Process options...
  62 print "Processing options...\n";
  63 my(%OptionsInfo);
  64 ProcessOptions();
  65 
  66 # Setup information about fingerprints inut and SD/text output files...
  67 my(%FingerprintsFilesInfo, %OutputFilesInfo, %SimilaritySearchInfo);
  68 print "Checking and retrieving information from reference and database fingerprints files...\n";
  69 RetrieveFingerprintsFilesInfo();
  70 
  71 # Perform similarity search...
  72 print "Performing similarity search...\n";
  73 my(%SimilaritySearchResults, %DatabaseFingerprintsFileData);
  74 PerformSimilaritySearch();
  75 
  76 print "\n$ScriptName:Done...\n\n";
  77 
  78 $EndTime = new Benchmark;
  79 $TotalTime = timediff ($EndTime, $StartTime);
  80 print "Total time: ", timestr($TotalTime), "\n";
  81 
  82 ###############################################################################
  83 
  84 # Perform similarity search using fingerprints data in reference and database text files...
  85 #
  86 sub PerformSimilaritySearch {
  87 
  88   print "\nProcessing fingerprints data for reference molecules...\n";
  89   ReadReferenceFingerprintsData();
  90 
  91   InitializeSimilaritySearchResults();
  92   GenerateSimilaritySearchResults();
  93   WriteSimilaritySearchResultFiles();
  94 }
  95 
  96 # Find similar molecules from database molecules for individual or multiple reference molecules...
  97 #
  98 sub GenerateSimilaritySearchResults {
  99   my($DatabaseFingerprintsFileIO, $FingerprintsCount, $IgnoredFingerprintsCount, $DatabaseFingerprintsObject, $DatabaseCmpdID, $ReferenceFingerprintsObject, $ReferenceIndex, $ReferenceCmpdID, $ComparisonValue, $FusedComparisonValue, @ComparisonValues);
 100 
 101   print "Processing fingerprints data for database molecules...\n";
 102 
 103   ($FingerprintsCount, $IgnoredFingerprintsCount) = (0) x 3;
 104 
 105   $DatabaseFingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{Database}{FingerprintsFileIOParameters}});
 106   $DatabaseFingerprintsFileIO->Open();
 107 
 108   @ComparisonValues = ();
 109 
 110   DATABASEFP: while ($DatabaseFingerprintsFileIO->Read()) {
 111     $FingerprintsCount++;
 112 
 113     if (!$DatabaseFingerprintsFileIO->IsFingerprintsDataValid()) {
 114       $IgnoredFingerprintsCount++;
 115       next DATABASEFP;
 116     }
 117     $DatabaseFingerprintsObject = $DatabaseFingerprintsFileIO->GetFingerprints();
 118     $DatabaseCmpdID = $DatabaseFingerprintsFileIO->GetCompoundID();
 119 
 120     if ($SimilaritySearchInfo{MultipleReferencesMode}) {
 121       @ComparisonValues = ();
 122     }
 123 
 124     REFERENCEFP: for $ReferenceIndex (0 .. $#{$SimilaritySearchInfo{ReferenceCmpdIDsRef}}) {
 125       $ReferenceCmpdID = $SimilaritySearchInfo{ReferenceCmpdIDsRef}->[$ReferenceIndex];
 126       $ReferenceFingerprintsObject = $SimilaritySearchInfo{ReferenceFingerprintsObjectsRef}->[$ReferenceIndex];
 127 
 128       $ComparisonValue = CompareReferenceAndDatabaseFingerprintsPair($ReferenceFingerprintsObject, $DatabaseFingerprintsObject);
 129       if (!defined $ComparisonValue) {
 130         next REFERENCEFP;
 131       }
 132 
 133       if ($SimilaritySearchInfo{IndividualReferenceMode}) {
 134         CollectSimilaritySearchResults($DatabaseFingerprintsFileIO, $DatabaseCmpdID, $ComparisonValue, $ReferenceCmpdID);
 135       }
 136       elsif ($SimilaritySearchInfo{MultipleReferencesMode}) {
 137         push @ComparisonValues, $ComparisonValue;
 138       }
 139     }
 140 
 141     if ($SimilaritySearchInfo{MultipleReferencesMode}) {
 142       $FusedComparisonValue = CalculateGroupFusionComparisonValue(\@ComparisonValues);
 143       if (!defined $FusedComparisonValue) {
 144         next DATABASEFP;
 145       }
 146       CollectSimilaritySearchResults($DatabaseFingerprintsFileIO, $DatabaseCmpdID, $FusedComparisonValue);
 147     }
 148   }
 149   $DatabaseFingerprintsFileIO->Close();
 150 
 151   print "Number of fingerprints data entries in database fingerprints file: $FingerprintsCount\n";
 152   print "Number of fingerprints date entries processed successfully: ", ($FingerprintsCount - $IgnoredFingerprintsCount)  , "\n";
 153   print "Number of fingerprints data entries ignored due to missing/invalid data: $IgnoredFingerprintsCount\n\n";
 154 }
 155 
 156 # Compare a pair of reference and database fingerprints objects corresponding to bit-vector or
 157 # vectors using specified comparison method and comparison cutoff...
 158 #
 159 sub CompareReferenceAndDatabaseFingerprintsPair {
 160   my($ReferenceFingerprintsObject, $DatabaseFingerprintsObject) = @_;
 161   my($ComparisonMethod, $ComparisonValue);
 162 
 163   $ComparisonMethod = $SimilaritySearchInfo{ComparisonMethod};
 164   $ComparisonValue = $ReferenceFingerprintsObject->$ComparisonMethod($DatabaseFingerprintsObject, @{$SimilaritySearchInfo{ComparisonMethodParameters}});
 165 
 166   if (!defined $ComparisonValue) {
 167     warn "Warning: Ignoring fingerprints data for reference compound ID ",  $ReferenceFingerprintsObject->GetID(), ": Its comparison with database compound ID, ", $DatabaseFingerprintsObject->GetID(), ", failed.\n";
 168     return undef;
 169   }
 170 
 171   $ComparisonValue = sprintf("%.$OptionsInfo{Precision}f", $ComparisonValue);
 172 
 173   # Apply any comparison cutoff...
 174   if ($SimilaritySearchInfo{ApplyComparisonCutoff}) {
 175     return $SimilaritySearchInfo{KeepTop} ? ($ComparisonValue >= $SimilaritySearchInfo{ComparisonCutoff} ? $ComparisonValue : undef) : ($ComparisonValue <= $SimilaritySearchInfo{ComparisonCutoff} ? $ComparisonValue : undef);
 176   }
 177   else {
 178     return $ComparisonValue;
 179   }
 180 }
 181 
 182 # Calculate group fusion comparison value...
 183 #
 184 sub CalculateGroupFusionComparisonValue {
 185   my($ComparisonValuesRef) = @_;
 186   my($FusedComparisonValue, @ComparisonValues);
 187 
 188   if (!@{$ComparisonValuesRef}) {
 189     return undef;
 190   }
 191 
 192   if ($SimilaritySearchInfo{SortComparisonValues}) {
 193     @ComparisonValues = sort { $SimilaritySearchInfo{KeepTop} ? ($b <=> $a) : ($a <=> $b) } @{$ComparisonValuesRef};
 194     if ($SimilaritySearchInfo{UsekNN} && ($OptionsInfo{kNN} < scalar @{$ComparisonValuesRef})) {
 195       # Keep only top kNN values for group fusion...
 196       splice @ComparisonValues, $OptionsInfo{kNN};
 197     }
 198     $ComparisonValuesRef = \@ComparisonValues;
 199   }
 200 
 201   $FusedComparisonValue = &{$SimilaritySearchInfo{GroupFusionMethodRef}}($ComparisonValuesRef);
 202   if ($SimilaritySearchInfo{ApplyPrecisionDuringFusion}) {
 203     $FusedComparisonValue = sprintf("%.$OptionsInfo{Precision}f", $FusedComparisonValue);
 204   }
 205 
 206   return $FusedComparisonValue;
 207 }
 208 
 209 # Collect similarity results for individual reference and multiple references search...
 210 #
 211 sub CollectSimilaritySearchResults {
 212   my($DatabaseFingerprintsFileIO, $DatabaseCmpdID, $ComparisonValue, $ReferenceCmpdID) = @_;
 213 
 214   if (defined $ReferenceCmpdID) {
 215     $SimilaritySearchResults{$ReferenceCmpdID}->AddKeyValuePair($ComparisonValue, $DatabaseCmpdID);
 216   }
 217   else {
 218     $SimilaritySearchResults{ResultsPseudoHeap}->AddKeyValuePair($ComparisonValue, $DatabaseCmpdID);
 219   }
 220 
 221   if ($FingerprintsFilesInfo{Database}{CollectInputFileData}) {
 222     CollectDatabaseFileData($DatabaseCmpdID, $DatabaseFingerprintsFileIO);
 223   }
 224 }
 225 
 226 # Initialize similarity results for individual or multiple reference molecules...
 227 #
 228 sub InitializeSimilaritySearchResults {
 229   my($ReferenceCmpdID);
 230 
 231   %SimilaritySearchResults = ();
 232 
 233   if ($SimilaritySearchInfo{IndividualReferenceMode}) {
 234     for $ReferenceCmpdID (@{$SimilaritySearchInfo{ReferenceCmpdIDsRef}}) {
 235       $SimilaritySearchResults{$ReferenceCmpdID} = new PseudoHeap('Type' => ($SimilaritySearchInfo{KeepTop} ? 'KeepTopN' : 'KeepBottomN'), 'KeyType' => 'Numeric', 'MaxSize' => $OptionsInfo{MaxSimilarMolecules});
 236     }
 237   }
 238   elsif ($SimilaritySearchInfo{MultipleReferencesMode}) {
 239     $SimilaritySearchResults{ResultsPseudoHeap} = new PseudoHeap('Type' => ($SimilaritySearchInfo{KeepTop} ? 'KeepTopN' : 'KeepBottomN'), 'KeyType' => 'Numeric', 'MaxSize' => $OptionsInfo{MaxSimilarMolecules});
 240   }
 241 
 242   %DatabaseFingerprintsFileData = ();
 243 }
 244 
 245 # Write out results SD and/or CSV/TSV text files for individual or multiple reference molecules...
 246 #
 247 sub WriteSimilaritySearchResultFiles {
 248   my($NewSDFileRef, $NewTextFileRef, $ReferenceCmpdID, $DatabaseCmpdID, $ComparisonValue);
 249 
 250   ($NewSDFileRef, $NewTextFileRef) = SetupAndOpenOutputFiles();
 251 
 252   if ($SimilaritySearchInfo{IndividualReferenceMode}) {
 253     for $ReferenceCmpdID (@{$SimilaritySearchInfo{ReferenceCmpdIDsRef}}) {
 254       for $ComparisonValue ($SimilaritySearchResults{$ReferenceCmpdID}->GetSortedKeys()) {
 255         for $DatabaseCmpdID ($SimilaritySearchResults{$ReferenceCmpdID}->GetKeyValues($ComparisonValue)) {
 256           WriteDataToOutputFiles($NewSDFileRef, $NewTextFileRef, $ComparisonValue, $DatabaseCmpdID, $ReferenceCmpdID);
 257         }
 258       }
 259     }
 260   }
 261   elsif ($SimilaritySearchInfo{MultipleReferencesMode}) {
 262     for $ComparisonValue ($SimilaritySearchResults{ResultsPseudoHeap}->GetSortedKeys()) {
 263       for $DatabaseCmpdID ($SimilaritySearchResults{ResultsPseudoHeap}->GetKeyValues($ComparisonValue)) {
 264         WriteDataToOutputFiles($NewSDFileRef, $NewTextFileRef, $ComparisonValue, $DatabaseCmpdID);
 265       }
 266     }
 267   }
 268 
 269   if ($NewSDFileRef) {
 270     close $NewSDFileRef;
 271   }
 272   if ($NewTextFileRef) {
 273     close $NewTextFileRef;
 274   }
 275 }
 276 
 277 # Write individual reference or multiple references similarity results along with any other data to output files...
 278 #
 279 sub WriteDataToOutputFiles {
 280   my($NewSDFileRef, $NewTextFileRef, $ComparisonValue, $DatabaseCmpdID, $ReferenceCmpdID) = @_;
 281 
 282   if ($NewSDFileRef) {
 283     WriteMolStringDataToSDOutputFile($DatabaseCmpdID, $NewSDFileRef);
 284     if (defined $ReferenceCmpdID) {
 285       print $NewSDFileRef  ">  <ReferenceCmpdID>\n$ReferenceCmpdID\n\n";
 286     }
 287     print $NewSDFileRef  ">  <DatabaseCmpdID>\n$DatabaseCmpdID\n\n>  <ComparisonValue>\n$ComparisonValue\n\n";
 288     WriteDatabaseDataToSDOutputFile($DatabaseCmpdID, $NewSDFileRef);
 289     print $NewSDFileRef "\$\$\$\$\n";
 290   }
 291 
 292   if ($NewTextFileRef) {
 293     my(@LineWords);
 294 
 295     @LineWords = ();
 296     if (defined $ReferenceCmpdID) {
 297       push @LineWords, $ReferenceCmpdID;
 298     }
 299     push @LineWords, ($DatabaseCmpdID, $ComparisonValue);
 300 
 301     if ($FingerprintsFilesInfo{Database}{OutputDataFields} || $FingerprintsFilesInfo{Database}{OutputDataCols}) {
 302       push @LineWords, RetrieveDatabaseDataForTextOutputFile($DatabaseCmpdID);
 303     }
 304     print $NewTextFileRef JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}), "\n";
 305   }
 306 }
 307 
 308 # Open output files...
 309 #
 310 sub SetupAndOpenOutputFiles {
 311   my($NewSDFileRef, $NewTextFileRef, $NewSDFile, $NewTextFile);
 312 
 313   ($NewSDFileRef, $NewTextFileRef) = (undef) x 2;
 314 
 315   if ($OptionsInfo{SDOutput}) {
 316     $NewSDFile = $OutputFilesInfo{SDOutFileName};
 317     print "Generating SD file $NewSDFile...\n";
 318     open NEWSDFILE, ">$NewSDFile" or die "Error: Couldn't open $NewSDFile: $! \n";
 319     $NewSDFileRef = \*NEWSDFILE;
 320   }
 321 
 322   if ($OptionsInfo{TextOutput}) {
 323     $NewTextFile = $OutputFilesInfo{TextOutFileName};
 324     print "Generating text file $NewTextFile...\n";
 325     open NEWTEXTFILE, ">$NewTextFile" or die "Error: Couldn't open $NewTextFile: $! \n";
 326     $NewTextFileRef = \*NEWTEXTFILE;
 327 
 328     WriteTextFileCoulmnLabels(\*NEWTEXTFILE);
 329   }
 330 
 331   return ($NewSDFileRef, $NewTextFileRef);
 332 }
 333 
 334 # Write out approriate column labels to text file...
 335 #
 336 sub WriteTextFileCoulmnLabels {
 337   my($NewTextFileRef) = @_;
 338   my($Line, @LineWords);
 339 
 340   @LineWords = ();
 341 
 342   if ($SimilaritySearchInfo{IndividualReferenceMode}) {
 343     push @LineWords, qw(ReferenceCompoundID DatabaseCompoundID ComparisonValue);
 344   }
 345   elsif ($SimilaritySearchInfo{MultipleReferencesMode}) {
 346     push @LineWords, qw(DatabaseCompoundID ComparisonValue);
 347   }
 348 
 349   # Add columns for other database fingerprints file data to be written to output file...
 350   if ($FingerprintsFilesInfo{Database}{OutputDataFields}) {
 351     push @LineWords, @{$FingerprintsFilesInfo{Database}{DataFieldsToOutput}};
 352   }
 353   elsif ($FingerprintsFilesInfo{Database}{OutputDataCols}) {
 354     push @LineWords, @{$FingerprintsFilesInfo{Database}{DataColLabelsToOutput}};
 355   }
 356 
 357   $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 358   print $NewTextFileRef "$Line\n";
 359 }
 360 
 361 # Write molecule string data to SD output file...
 362 #
 363 sub WriteMolStringDataToSDOutputFile {
 364   my($DatabaseCmpdID, $NewSDFileRef) = @_;
 365 
 366   if ($FingerprintsFilesInfo{Database}{CollectCmpdStringData}) {
 367     my($MolString);
 368 
 369     ($MolString) = split /M  END/, $DatabaseFingerprintsFileData{$DatabaseCmpdID};
 370     print $NewSDFileRef "$MolString\nM  END\n";
 371   }
 372   else {
 373     # Just write out an empty molecule data string...
 374     print $NewSDFileRef SDFileUtil::GenerateEmptyCtabBlockLines(), "\n";
 375   }
 376 }
 377 
 378 # Write database data from SD or Text database file to SD output file...
 379 #
 380 sub WriteDatabaseDataToSDOutputFile {
 381   my($DatabaseCmpdID, $NewSDFileRef) = @_;
 382 
 383   if ($FingerprintsFilesInfo{Database}{OutputDataFields}) {
 384     my($DataFieldLabel, $DataFieldValue, @CmpdLines, %DataFieldLabelAndValues);
 385 
 386     @CmpdLines = split /\n/, $DatabaseFingerprintsFileData{$DatabaseCmpdID};
 387     %DataFieldLabelAndValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 388 
 389     for $DataFieldLabel ($FingerprintsFilesInfo{Database}{OutputCurrentDataFields} ? GetCmpdDataHeaderLabels(\@CmpdLines) : @{$FingerprintsFilesInfo{Database}{DataFieldsToOutput}}) {
 390       $DataFieldValue = exists $DataFieldLabelAndValues{$DataFieldLabel} ? $DataFieldLabelAndValues{$DataFieldLabel} : '';
 391       print $NewSDFileRef  ">  <$DataFieldLabel>\n$DataFieldValue\n\n";
 392     }
 393   }
 394   elsif ($FingerprintsFilesInfo{Database}{OutputDataCols}) {
 395     my($DataColNum, $DataFieldLabel, $DataFieldValue);
 396 
 397     for $DataColNum (@{$FingerprintsFilesInfo{Database}{DataColNumsToOutput}}) {
 398       $DataFieldLabel = $FingerprintsFilesInfo{Database}{DataColNumToLabelMap}{$DataColNum};
 399       $DataFieldValue =  $DatabaseFingerprintsFileData{$DatabaseCmpdID}->[$DataColNum];
 400       print $NewSDFileRef  ">  <$DataFieldLabel>\n$DataFieldValue\n\n";
 401     }
 402   }
 403 }
 404 
 405 # Retriebe database data from SD or Text database file for text output file...
 406 #
 407 sub RetrieveDatabaseDataForTextOutputFile {
 408   my($DatabaseCmpdID) = @_;
 409 
 410   if ($FingerprintsFilesInfo{Database}{OutputDataFields}) {
 411     my(@CmpdLines, %DataFieldLabelAndValues);
 412 
 413     @CmpdLines = split /\n/, $DatabaseFingerprintsFileData{$DatabaseCmpdID};
 414     %DataFieldLabelAndValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 415 
 416     return map { exists $DataFieldLabelAndValues{$_} ? $DataFieldLabelAndValues{$_} : ''} @{$FingerprintsFilesInfo{Database}{DataFieldsToOutput}};
 417   }
 418   elsif ($FingerprintsFilesInfo{Database}{OutputDataCols}) {
 419     if (exists $DatabaseFingerprintsFileData{$DatabaseCmpdID}) {
 420       return map { $DatabaseFingerprintsFileData{$DatabaseCmpdID}->[$_] } (0 .. $#{$FingerprintsFilesInfo{Database}{DataColNumsToOutput}});
 421     }
 422     else {
 423       return ('') x $#{$FingerprintsFilesInfo{Database}{DataColNumsToOutput}};
 424     }
 425   }
 426 }
 427 
 428 # Collect database file SD compound string or CSV/TSV data line for generating results
 429 # files..
 430 #
 431 sub CollectDatabaseFileData {
 432   my($DatabaseCmpdID, $DatabaseFingerprintsFileIO) = @_;
 433 
 434   if (exists $DatabaseFingerprintsFileData{$DatabaseCmpdID}) {
 435     return;
 436   }
 437 
 438   if ($FingerprintsFilesInfo{Database}{CollectCmpdStringData}) {
 439     $DatabaseFingerprintsFileData{$DatabaseCmpdID} = $DatabaseFingerprintsFileIO->GetCompoundString();
 440   }
 441 
 442   if ($FingerprintsFilesInfo{Database}{CollectDataLine}) {
 443     my(@DataLineWords);
 444     @DataLineWords = $DatabaseFingerprintsFileIO->GetDataLineWords();
 445     $DatabaseFingerprintsFileData{$DatabaseCmpdID} = \@DataLineWords;
 446   }
 447 
 448 }
 449 
 450 # Read fingerprints data from reference fingerprints file...
 451 #
 452 sub ReadReferenceFingerprintsData {
 453   my($FingerprintsFileIO);
 454 
 455   $FingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{Reference}{FingerprintsFileIOParameters}});
 456   ($SimilaritySearchInfo{ReferenceCmpdIDsRef}, $SimilaritySearchInfo{ReferenceFingerprintsObjectsRef}) = Fingerprints::FingerprintsFileUtil::ReadAndProcessFingerpritsData($FingerprintsFileIO);
 457 
 458 }
 459 
 460 # Retrieve information about fingerprints files...
 461 #
 462 sub RetrieveFingerprintsFilesInfo {
 463 
 464   %FingerprintsFilesInfo = ();
 465   %OutputFilesInfo = ();
 466   %SimilaritySearchInfo = ();
 467 
 468   %{$FingerprintsFilesInfo{Reference}} = ();
 469   %{$FingerprintsFilesInfo{Database}} = ();
 470 
 471   # Set up reference and database file names...
 472   $FingerprintsFilesInfo{Reference}{FileName} = $FingerprintsFilesList[0];
 473   $FingerprintsFilesInfo{Database}{FileName} = $FingerprintsFilesList[1];
 474 
 475   # Retrieve information about reference and database fingerprints file...
 476   RetrieveReferenceFingerprintsFileInfo();
 477   RetrieveDatabaseFingerprintsFileInfo();
 478 
 479   # Setup fingerprints comparison method and associated method parameters...
 480   SetupReferenceAndDatabaseFingerprintsComparisonInfo();
 481 
 482   # Retrieve information for output files...
 483   RetrieveOutputFilesInfo();
 484 }
 485 
 486 # Setup refrerence and database fingerprints comparison method and associated method parameters...
 487 #
 488 sub SetupReferenceAndDatabaseFingerprintsComparisonInfo {
 489 
 490   # Make sure reference and database fingerprints string match...
 491   if (($FingerprintsFilesInfo{Reference}{FirstFingerprintsStringType} !~ /^$FingerprintsFilesInfo{Database}{FirstFingerprintsStringType}$/i) ||
 492      ($FingerprintsFilesInfo{Reference}{FingerprintsBitVectorStringMode} != $FingerprintsFilesInfo{Database}{FingerprintsBitVectorStringMode}) ||
 493      ($FingerprintsFilesInfo{Reference}{FingerprintsVectorStringMode} != $FingerprintsFilesInfo{Database}{FingerprintsVectorStringMode}) ) {
 494     die "Error: First reference fingerprints string type, $FingerprintsFilesInfo{Reference}{FirstFingerprintsStringType}, must match first database fingerprints type, $FingerprintsFilesInfo{Database}{FirstFingerprintsStringType}.\n";
 495   }
 496 
 497   if ($FingerprintsFilesInfo{Reference}{FirstFingerprintsStringDescription} !~ /^$FingerprintsFilesInfo{Database}{FirstFingerprintsStringDescription}$/i) {
 498     warn "Warning: First reference fingerprints string description, $FingerprintsFilesInfo{Reference}{FirstFingerprintsStringDescription}, doesn't match first database fingerprints string description, $FingerprintsFilesInfo{Database}{FirstFingerprintsStringDescription}.\n";
 499   }
 500 
 501   # Setup individual reference and multiple references search mode...
 502   $SimilaritySearchInfo{IndividualReferenceMode} = undef;
 503   $SimilaritySearchInfo{MultipleReferencesMode} = undef;
 504 
 505   if ($OptionsInfo{Mode} =~ /^IndividualReference$/i) {
 506     $SimilaritySearchInfo{IndividualReferenceMode} = 1;
 507   }
 508   elsif ($OptionsInfo{Mode} =~ /^MultipleReferences$/i) {
 509     $SimilaritySearchInfo{MultipleReferencesMode} = 1;
 510   }
 511   else {
 512     die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: IndividualReference, MultipleReferences\n";
 513   }
 514 
 515   # Set up reference and database fingerprints similarity search method and paramaters...
 516   my($ComparisonMeasure, $ComparisonMethod, $ApplyComparisonCutoff, $ComparisonCutoff, $KeepTop, @ComparisonMethodParameters);
 517 
 518   $SimilaritySearchInfo{ComparisonMethod} = '';
 519   @{$SimilaritySearchInfo{ComparisonMethodParameters}} = ();
 520 
 521   $SimilaritySearchInfo{ComparisonCutoff} = '';
 522   $SimilaritySearchInfo{KeepTop} = '';
 523 
 524   $ComparisonMeasure = ''; $ComparisonMethod = '';
 525   @ComparisonMethodParameters = ();
 526 
 527   FINGERPRINTSTYPE: {
 528     if ($FingerprintsFilesInfo{Reference}{FingerprintsBitVectorStringMode}) {
 529       $ComparisonMeasure = $OptionsInfo{SpecifiedBitVectorComparisonMeasure};
 530       $ComparisonMethod = $OptionsInfo{SpecifiedBitVectorComparisonMeasureMethod};
 531 
 532       if ($ComparisonMeasure =~ /^TverskySimilarity$/i) {
 533         push @ComparisonMethodParameters, $OptionsInfo{Alpha};
 534       }
 535       elsif ($ComparisonMeasure =~ /^WeightedTverskySimilarity$/i) {
 536         push @ComparisonMethodParameters, $OptionsInfo{Alpha};
 537         push @ComparisonMethodParameters, $OptionsInfo{Beta};
 538       }
 539       elsif ($ComparisonMeasure =~ /^WeightedTanimotoSimilarity$/i) {
 540         push @ComparisonMethodParameters, $OptionsInfo{Beta};
 541       }
 542 
 543       last FINGERPRINTSTYPE;
 544     }
 545     if ($FingerprintsFilesInfo{Reference}{FingerprintsVectorStringMode}) {
 546       my($SkipValuesCheck);
 547 
 548       $ComparisonMeasure = $OptionsInfo{SpecifiedVectorComparisonMeasure};
 549       $ComparisonMethod = $OptionsInfo{SpecifiedVectorComparisonMeasuresMethod};
 550 
 551       push @ComparisonMethodParameters, $OptionsInfo{SpecifiedVectorComparisonMode};
 552 
 553       $SkipValuesCheck = $OptionsInfo{Fast} ? 1 : 0;
 554       push @ComparisonMethodParameters, $SkipValuesCheck;
 555 
 556       last FINGERPRINTSTYPE;
 557     }
 558     die "Error: Uknown fingerprints string type. Supported values: FingerprintsBitVectorString or FingerprintsVectorString.\n";
 559   }
 560 
 561   $ApplyComparisonCutoff = $SimilaritySearchInfo{IndividualReferenceMode} ? 1 : (($SimilaritySearchInfo{MultipleReferencesMode} && $OptionsInfo{GroupFusionApplyCutoff}) ? 1 : 0);
 562 
 563   $ComparisonCutoff = ''; $KeepTop = '';
 564   if ($ComparisonMethod =~ /Distance/i) {
 565     $ComparisonCutoff = $OptionsInfo{DistanceCutoff};
 566     $KeepTop = ($OptionsInfo{SearchMode} =~ /^SimilaritySearch$/i) ? 0 : 1;
 567   }
 568   else {
 569     $ComparisonCutoff = $OptionsInfo{SimilarityCutoff};
 570     $KeepTop = ($OptionsInfo{SearchMode} =~ /^SimilaritySearch$/i) ? 1 : 0;
 571   }
 572 
 573   $SimilaritySearchInfo{ComparisonMethod} = $ComparisonMethod;
 574   @{$SimilaritySearchInfo{ComparisonMethodParameters}} = @ComparisonMethodParameters;
 575 
 576   $SimilaritySearchInfo{ComparisonCutoff} = $ComparisonCutoff;
 577   $SimilaritySearchInfo{KeepTop} = $KeepTop;
 578   $SimilaritySearchInfo{ApplyComparisonCutoff} = $ApplyComparisonCutoff;
 579 
 580   # Setup references to group fusion methods...
 581   $SimilaritySearchInfo{GroupFusionMethodRef} = undef;
 582   $SimilaritySearchInfo{ApplyPrecisionDuringFusion} = undef;
 583 
 584   FUSIONRULE: {
 585     if ($OptionsInfo{GroupFusionRule} =~ /^Max$/i) {
 586       # It's always the first value in the appropriated sorted list using value of KeepTop...
 587       $SimilaritySearchInfo{GroupFusionMethodRef} = sub { my($ComparisonValuesRef) = @_; return $ComparisonValuesRef->[0]; };
 588       last FUSIONRULE;
 589     }
 590     if ($OptionsInfo{GroupFusionRule} =~ /^Min$/i) {
 591       # It's always the last value in the appropriated sorted list using value of KeepTop...
 592       $SimilaritySearchInfo{GroupFusionMethodRef} = sub { my($ComparisonValuesRef) = @_; return $ComparisonValuesRef->[$#{$ComparisonValuesRef}]; };
 593       last FUSIONRULE;
 594     }
 595     if ($OptionsInfo{GroupFusionRule} =~ /^Mean$/i) {
 596       $SimilaritySearchInfo{GroupFusionMethodRef} = \&StatisticsUtil::Mean;
 597       $SimilaritySearchInfo{ApplyPrecisionDuringFusion} = 1;
 598       last FUSIONRULE;
 599     }
 600     if ($OptionsInfo{GroupFusionRule} =~ /^Median$/i) {
 601       $SimilaritySearchInfo{GroupFusionMethodRef} = \&StatisticsUtil::Median;
 602       $SimilaritySearchInfo{ApplyPrecisionDuringFusion} = 1;
 603       last FUSIONRULE;
 604     }
 605     if ($OptionsInfo{GroupFusionRule} =~ /^Sum$/i) {
 606       $SimilaritySearchInfo{GroupFusionMethodRef} = \&StatisticsUtil::Sum;
 607       $SimilaritySearchInfo{ApplyPrecisionDuringFusion} = 1;
 608       last FUSIONRULE;
 609     }
 610     if ($OptionsInfo{GroupFusionRule} =~ /^Euclidean$/i) {
 611       $SimilaritySearchInfo{GroupFusionMethodRef} = \&StatisticsUtil::Euclidean;
 612       $SimilaritySearchInfo{ApplyPrecisionDuringFusion} = 1;
 613       last FUSIONRULE;
 614     }
 615     die "Error: The value specified, $Options{groupfusionrule}, for option \"-g, --GroupFusionRule\" is not valid. Allowed values: Max, Min, Mean, Median, Sum, Euclidean\n";
 616   }
 617 
 618   $SimilaritySearchInfo{UsekNN} = ($OptionsInfo{kNN} !~ /^All$/i) ? 1 : 0;
 619   $SimilaritySearchInfo{SortComparisonValues} = (($OptionsInfo{GroupFusionRule} =~ /^(Max|Min)$/i) || $SimilaritySearchInfo{UsekNN}) ? 1 : 0;
 620 }
 621 
 622 # Retrieve information about reference fingerprints file...
 623 #
 624 sub RetrieveReferenceFingerprintsFileInfo {
 625   my($FingerprintsFile, $FileType, $InDelim, $FingerprintsFileIO, $FingerprintsStringMode, $FingerprintsBitVectorStringMode, $FingerprintsVectorStringMode, $FirstFingerprintsStringType, $FirstFingerprintsStringDescription);
 626 
 627   $FingerprintsFile = $FingerprintsFilesInfo{Reference}{FileName};
 628   ($FileType, $InDelim) =  RetrieveFingerprintsFileInfo($FingerprintsFile);
 629 
 630   $FingerprintsFilesInfo{Reference}{FileType} = $FileType;
 631   $FingerprintsFilesInfo{Reference}{InDelim} = $InDelim;
 632 
 633   # Setup reference FingerprintsFileIO parameters...
 634   %{$FingerprintsFilesInfo{Reference}{FingerprintsFileIOParameters}} = RetrieveFingerprintsFileIOParameters('Reference', $FileType, $FingerprintsFile);
 635 
 636   # Make sure reference fingerprints data file contains valid and retrieve fingerprints string mode information...
 637   ($FingerprintsStringMode, $FingerprintsBitVectorStringMode, $FingerprintsVectorStringMode, $FirstFingerprintsStringType, $FirstFingerprintsStringDescription) = RetrieveFingerprintsFileFingerprintsStringInfo('Reference', $FingerprintsFile);
 638   $FingerprintsFilesInfo{Reference}{FingerprintsStringMode} = $FingerprintsStringMode;
 639   $FingerprintsFilesInfo{Reference}{FingerprintsBitVectorStringMode} = $FingerprintsBitVectorStringMode;
 640   $FingerprintsFilesInfo{Reference}{FingerprintsVectorStringMode} = $FingerprintsVectorStringMode;
 641   $FingerprintsFilesInfo{Reference}{FirstFingerprintsStringType} = $FirstFingerprintsStringType;
 642   $FingerprintsFilesInfo{Reference}{FirstFingerprintsStringDescription} = $FirstFingerprintsStringDescription;
 643 
 644 }
 645 
 646 # Retrieve information about database fingerprints file...
 647 #
 648 sub RetrieveDatabaseFingerprintsFileInfo {
 649   my($FingerprintsFile, $FileType, $InDelim, $FingerprintsFileIO, $FingerprintsStringMode, $FingerprintsBitVectorStringMode, $FingerprintsVectorStringMode, $FirstFingerprintsStringType, $FirstFingerprintsStringDescription);
 650 
 651   $FingerprintsFile = $FingerprintsFilesInfo{Database}{FileName};
 652   ($FileType, $InDelim) =  RetrieveFingerprintsFileInfo($FingerprintsFile);
 653 
 654   $FingerprintsFilesInfo{Database}{FileType} = $FileType;
 655   $FingerprintsFilesInfo{Database}{InDelim} = $InDelim;
 656 
 657   # Setup reference FingerprintsFileIO parameters...
 658   %{$FingerprintsFilesInfo{Database}{FingerprintsFileIOParameters}} = RetrieveFingerprintsFileIOParameters('Database', $FileType, $FingerprintsFile);
 659 
 660   # Make sure database fingerprints data file contains valid and retrieve fingerprints string mode information...
 661   ($FingerprintsStringMode, $FingerprintsBitVectorStringMode, $FingerprintsVectorStringMode, $FirstFingerprintsStringType, $FirstFingerprintsStringDescription) = RetrieveFingerprintsFileFingerprintsStringInfo('Database', $FingerprintsFile);
 662   $FingerprintsFilesInfo{Database}{FingerprintsStringMode} = $FingerprintsStringMode;
 663   $FingerprintsFilesInfo{Database}{FingerprintsBitVectorStringMode} = $FingerprintsBitVectorStringMode;
 664   $FingerprintsFilesInfo{Database}{FingerprintsVectorStringMode} = $FingerprintsVectorStringMode;
 665   $FingerprintsFilesInfo{Database}{FirstFingerprintsStringType} = $FirstFingerprintsStringType;
 666   $FingerprintsFilesInfo{Database}{FirstFingerprintsStringDescription} = $FirstFingerprintsStringDescription;
 667 
 668   # Retrieve database fingerprints data field information for output file...
 669   #
 670   RetrieveDatabaseFingerprintsDataFieldsInfo($FingerprintsFile, $FileType, $InDelim);
 671 
 672   # Retrieve database fingerprints text file data columns information for output file...
 673   #
 674   RetrieveDatabaseFingerprintsDataColsInfo($FingerprintsFile, $FileType, $InDelim);
 675 
 676   # Any need to collect database compound string or data line for generation of results files...
 677   $FingerprintsFilesInfo{Database}{CollectCmpdStringData} = ($FileType =~ /^SD$/i) ? 1 : 0;
 678   $FingerprintsFilesInfo{Database}{CollectDataLine} = ($FileType =~ /^Text$/i && $OptionsInfo{DatabaseDataColsMode} =~ /^(All|Specify)$/i) ? 1 : 0;
 679   $FingerprintsFilesInfo{Database}{CollectInputFileData} = ($FingerprintsFilesInfo{Database}{CollectCmpdStringData} || $FingerprintsFilesInfo{Database}{CollectDataLine}) ? 1 : 0;
 680 
 681   # Set maximum number of similar compounds to find for individual reference of set of multiple
 682   # reference compounds...
 683   #
 684   SetMaximumSimilarMoleculesToRetrieve($FingerprintsFile, $FileType, $InDelim);
 685 }
 686 
 687 # Retrieve database fingerprints data field information...
 688 #
 689 sub RetrieveDatabaseFingerprintsDataFieldsInfo {
 690   my($FingerprintsFile, $FileType, $InDelim) = @_;
 691   my($CollectDataFields, $CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef, @DataFieldsToOutput);
 692 
 693   $FingerprintsFilesInfo{Database}{OutputDataFields} = 0;
 694   @{$FingerprintsFilesInfo{Database}{DataFieldsToOutput}} = ();
 695 
 696   $FingerprintsFilesInfo{Database}{OutputCurrentDataFields} = 0;
 697 
 698   @{$FingerprintsFilesInfo{Database}{AllDataFields}} = ();
 699   @{$FingerprintsFilesInfo{Database}{CommonDataFields}} = ();
 700   @{$FingerprintsFilesInfo{Database}{SpecifiedDatabaseDataFields}} = ();
 701 
 702   if ($FileType !~ /^SD$/i) {
 703     return;
 704   }
 705 
 706   # No need to go over SD file and collect data fields for SD file during All DatabaseDataFieldsMode as
 707   # they would be retrieved from database SD file compound string during generation of output files...
 708   #
 709   $CollectDataFields = (($OptionsInfo{TextOutput} && $OptionsInfo{DatabaseDataFieldsMode} =~ /^(All|Common)$/i) || ($OptionsInfo{SDOutput} && $OptionsInfo{DatabaseDataFieldsMode} =~ /^Common$/i)) ? 1 : 0;
 710 
 711   ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = (undef) x 2;
 712 
 713   if ($CollectDataFields) {
 714     open SDFILE, "$FingerprintsFile" or die "Error: Couldn't open $FingerprintsFile: $! \n";
 715     ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE);
 716     close SDFILE;
 717   }
 718 
 719   @DataFieldsToOutput = ();
 720   if ($OptionsInfo{DatabaseDataFieldsMode} =~ /^All$/i) {
 721     if (defined $AllDataFieldsRef) {
 722       push @DataFieldsToOutput, @{$AllDataFieldsRef};
 723       push @{$FingerprintsFilesInfo{Database}{AllDataFields}}, @{$AllDataFieldsRef};
 724     }
 725     else {
 726       # Retrieve and output data fields and values dynamically...
 727       $FingerprintsFilesInfo{Database}{OutputCurrentDataFields} = 1;
 728     }
 729   }
 730   elsif ($OptionsInfo{DatabaseDataFieldsMode} =~ /^Common$/i) {
 731     if (defined $CommonDataFieldsRef) {
 732       push @DataFieldsToOutput, @{$CommonDataFieldsRef};
 733       push @{$FingerprintsFilesInfo{Database}{CommonDataFields}}, @{$CommonDataFieldsRef};
 734     }
 735   }
 736   elsif ($OptionsInfo{DatabaseDataFieldsMode} =~ /^Specify$/i) {
 737     push @DataFieldsToOutput, @{$OptionsInfo{SpecifiedDatabaseDataFields}};
 738     push @{$FingerprintsFilesInfo{Database}{SpecifiedDatabaseDataFields}}, @{$OptionsInfo{SpecifiedDatabaseDataFields}};
 739   }
 740 
 741   if ($OptionsInfo{DatabaseDataFieldsMode} !~ /^CompoundID$/i) {
 742     $FingerprintsFilesInfo{Database}{OutputDataFields} = 1;
 743   }
 744 
 745   push @{$FingerprintsFilesInfo{Database}{DataFieldsToOutput}}, @DataFieldsToOutput;
 746 
 747 }
 748 
 749 # Retrieve database fingerprints data columns information...
 750 #
 751 sub RetrieveDatabaseFingerprintsDataColsInfo {
 752   my($FingerprintsFile, $FileType, $InDelim) = @_;
 753   my($Line, $ColNum, $ColLabel, $NumOfCols, @DataColLabels, @DataColLabelsToOutput, @DataColNumsToOutput, %DataColLabelToNumMap, %DataColNumToLabelMap);
 754 
 755   $FingerprintsFilesInfo{Database}{OutputDataCols} = 0;
 756 
 757   @{$FingerprintsFilesInfo{Database}{DataColLabels}} = ();
 758   %{$FingerprintsFilesInfo{Database}{DataColLabelToNumMap}} = ();
 759   %{$FingerprintsFilesInfo{Database}{DataColNumToLabelMap}} = ();
 760 
 761   @{$FingerprintsFilesInfo{Database}{DataColNumsToOutput}} = ();
 762   @{$FingerprintsFilesInfo{Database}{DataColLabelsToOutput}} = ();
 763 
 764   if ($FileType !~ /^Text$/i) {
 765     return;
 766   }
 767 
 768   @DataColLabels = ();
 769   @DataColLabelsToOutput = ();
 770   @DataColNumsToOutput = ();
 771 
 772   %DataColLabelToNumMap = ();
 773   %DataColNumToLabelMap = ();
 774 
 775   # Get column label line...
 776   open TEXTFILE, "$FingerprintsFile" or die "Error: Couldn't open $FingerprintsFile: $! \n";
 777   $Line = TextUtil::GetTextLine(\*TEXTFILE);
 778   close TEXTFILE;
 779 
 780   $InDelim = ($InDelim =~ /^Tab$/i) ? "\t" : ($InDelim =~ /semicolon/i ? "\;" : "\,");
 781 
 782   @DataColLabels = TextUtil::SplitWords($Line, $InDelim);
 783   $NumOfCols = scalar @DataColLabels;
 784 
 785   for $ColNum (0 .. $#DataColLabels) {
 786     $ColLabel = $DataColLabels[$ColNum];
 787     $DataColLabelToNumMap{$ColLabel} = $ColNum;
 788     $DataColNumToLabelMap{$ColNum} = $ColLabel;
 789   }
 790 
 791   if ($OptionsInfo{DatabaseDataColsMode} =~ /^Specify$/i) {
 792     if ($OptionsInfo{DatabaseColMode} =~ /^ColNum$/i) {
 793       for $ColNum (@{$OptionsInfo{SpecifiedDatabaseDataCols}}) {
 794         if ($ColNum > $NumOfCols) {
 795           die "Error: Column number, $ColNum, specified using \"--DatabaseDataCols\" is not valid: It must be <= $NumOfCols\n";
 796         }
 797         push @DataColNumsToOutput, ($ColNum - 1);
 798       }
 799     }
 800     elsif ($OptionsInfo{DatabaseColMode} =~ /^ColLabel$/i) {
 801       for $ColLabel (@{$OptionsInfo{SpecifiedDatabaseDataCols}}) {
 802         if (!exists $DataColLabelToNumMap{$ColLabel}) {
 803           die "Error: Column label, $ColLabel, specified using \"--DatabaseDataCols\" is not valid: It doesn't exist\n";
 804         }
 805         push @DataColNumsToOutput, $DataColLabelToNumMap{$ColLabel};
 806       }
 807     }
 808   }
 809   elsif ($OptionsInfo{DatabaseDataColsMode} =~ /^All$/i) {
 810     @DataColNumsToOutput = map { $_ } (0 .. $#DataColLabels);
 811   }
 812 
 813   # Setup data column labels to output...
 814   if (scalar @DataColNumsToOutput) {
 815     @DataColLabelsToOutput = map { $DataColNumToLabelMap{$_} } (0 .. $#DataColNumsToOutput);
 816   }
 817 
 818   $FingerprintsFilesInfo{Database}{OutputDataCols} = scalar @DataColNumsToOutput ? 1 : 0;
 819 
 820   @{$FingerprintsFilesInfo{Database}{DataColLabels}} = @DataColLabels;
 821   %{$FingerprintsFilesInfo{Database}{DataColLabelToNumMap}} = %DataColLabelToNumMap;
 822   %{$FingerprintsFilesInfo{Database}{DataColNumToLabelMap}} = %DataColNumToLabelMap;
 823 
 824   @{$FingerprintsFilesInfo{Database}{DataColNumsToOutput}} = @DataColNumsToOutput;
 825   @{$FingerprintsFilesInfo{Database}{DataColLabelsToOutput}} = @DataColLabelsToOutput;
 826 }
 827 
# Set maximum number of similar compounds to find for an individual reference or a set of
# multiple reference compounds...
 830 #
 831 sub SetMaximumSimilarMoleculesToRetrieve {
 832   my($FingerprintsFile, $FileType, $InDelim) = @_;
 833   my($MaxSimilarMolecules, $NumOfDatabaseMolecules, $PercentSimilarMolecules, $Line);
 834 
 835   if ($OptionsInfo{SimilarCountMode} !~ /^PercentSimilar$/i) {
 836     return;
 837   }
 838 
 839   $PercentSimilarMolecules = $OptionsInfo{PercentSimilarMolecules};
 840 
 841   # Count database entries to figure out MaxSimilarMolecules using PercentSimilarMolecules
 842   # value...
 843   $NumOfDatabaseMolecules = 0;
 844   if ($FileType =~ /^SD$/i && exists($FingerprintsFilesInfo{Database}{NumOfDatabaseMolecules})) {
 845     # It might already be counted for SD file...
 846     $NumOfDatabaseMolecules = $FingerprintsFilesInfo{Database}{NumOfDatabaseMolecules};
 847   }
 848   else {
 849     print "Calculating maximum number of similar molecules to retrieve for \"PercentSimilar\" value of \"--SimilarCountMode\" option by counting number of molecules in database fingerprints file...\n";
 850     open FINGERPRINTSFILE, "$FingerprintsFile" or die "Error: Couldn't open $FingerprintsFile: $! \n";
 851     FILETYPE: {
 852       if ($FileType =~ /^SD$/i) {
 853         while ($Line = TextUtil::GetTextLine(\*FINGERPRINTSFILE)) {
 854           if ($Line =~ /^\$\$\$\$/) {
 855             $NumOfDatabaseMolecules++;
 856           }
 857         }
 858         last FILETYPE;
 859       }
 860       if ($FileType =~ /^Text$/i) {
 861         # Ignore column label line...
 862         $Line = TextUtil::GetTextLine(\*FINGERPRINTSFILE);
 863         while ($Line = TextUtil::GetTextLine(\*FINGERPRINTSFILE)) {
 864           $NumOfDatabaseMolecules++;
 865         }
 866         last FILETYPE;
 867       }
 868       if ($FileType =~ /^FP$/i) {
 869         while ($Line = TextUtil::GetTextLine(\*FINGERPRINTSFILE)) {
 870           if ($Line !~ /^#/) {
 871             $NumOfDatabaseMolecules++;
 872           }
 873         }
 874         last FILETYPE;
 875       }
 876       $NumOfDatabaseMolecules = 0;
 877     }
 878     close FINGERPRINTSFILE;
 879     $FingerprintsFilesInfo{Database}{NumOfDatabaseMolecules} = $NumOfDatabaseMolecules;
 880   }
 881 
 882   $MaxSimilarMolecules = int (($NumOfDatabaseMolecules * $PercentSimilarMolecules)/100);
 883   if ($MaxSimilarMolecules < 1) {
 884     $MaxSimilarMolecules = 1;
 885   }
 886 
 887   $OptionsInfo{MaxSimilarMolecules} = $MaxSimilarMolecules;
 888 }
 889 
 890 # Retrieve information about fingerprints file...
 891 #
 892 sub RetrieveFingerprintsFileInfo {
 893   my($FingerprintsFile) = @_;
 894   my($FileType, $InDelim, $FileDir, $FileExt, $FileName);
 895 
 896   if (!(-e $FingerprintsFile)) {
 897     die "Error: Input fingerprints file, $FingerprintsFile, doesn't exist.\n";
 898   }
 899 
 900   $FileType = Fingerprints::FingerprintsFileUtil::GetFingerprintsFileType($FingerprintsFile);
 901   if (IsEmpty($FileType)) {
 902     die "Error: Input file, $FingerprintsFile, is not a fingerprints file.\n";
 903   }
 904 
 905   $InDelim = '';
 906   if ($FileType =~ /^Text$/i) {
 907     $FileDir = ""; $FileName = ""; $FileExt = "";
 908     ($FileDir, $FileName, $FileExt) = ParseFileName($FingerprintsFile);
 909     $InDelim = ($FileExt =~ /^tsv$/i) ? 'Tab' : $OptionsInfo{InDelim};
 910   }
 911 
 912   return ($FileType, $InDelim);
 913 }
 914 
 915 # Retrieve fingerprints file IO parameters...
 916 #
 917 sub RetrieveFingerprintsFileIOParameters {
 918   my($FingerprintsFileMode, $FileType, $FingerprintsFile) = @_;
 919   my(%FingerprintsFileIOParams);
 920 
 921   if ($FingerprintsFileMode !~ /^(Reference|Database)$/) {
 922     die "Error: Unknown fingerprints file mode: $FingerprintsFileMode. Supported values: Reference or Database\n";
 923   }
 924 
 925   %FingerprintsFileIOParams = ();
 926 
 927   FILETYPE: {
 928     if ($FileType =~ /^SD$/i) {
 929       %FingerprintsFileIOParams = ('Name' => $FingerprintsFile, 'Mode' => 'Read', 'FingerprintsStringMode' => $OptionsInfo{FingerprintsMode}, 'ValidateData' => $OptionsInfo{ValidateData}, 'DetailLevel' =>  $OptionsInfo{Detail}, 'FingerprintsFieldLabel' => $OptionsInfo{"${FingerprintsFileMode}FingerprintsField"}, 'CompoundIDMode' => $OptionsInfo{"${FingerprintsFileMode}CompoundIDMode"}, 'CompoundIDFieldLabel' => $OptionsInfo{"${FingerprintsFileMode}CompoundIDField"}, 'CompoundIDPrefix' => $OptionsInfo{"${FingerprintsFileMode}CompoundIDPrefix"});
 930       last FILETYPE;
 931     }
 932     if ($FileType =~ /^FP$/i) {
 933       %FingerprintsFileIOParams = ('Name' => $FingerprintsFile, 'Mode' => 'Read', 'FingerprintsStringMode' => $OptionsInfo{FingerprintsMode}, 'ValidateData' => $OptionsInfo{ValidateData}, 'DetailLevel' =>  $OptionsInfo{Detail});
 934       last FILETYPE;
 935     }
 936     if ($FileType =~ /^Text$/i) {
 937       %FingerprintsFileIOParams = ('Name' => $FingerprintsFile, 'Mode' => 'Read', 'FingerprintsStringMode' => $OptionsInfo{FingerprintsMode}, 'ValidateData' => $OptionsInfo{ValidateData}, 'DetailLevel' =>  $OptionsInfo{Detail}, 'FingerprintsCol' => $OptionsInfo{"${FingerprintsFileMode}FingerprintsCol"}, 'ColMode' => $OptionsInfo{"${FingerprintsFileMode}ColMode"}, 'CompoundIDCol' => $OptionsInfo{"${FingerprintsFileMode}CompoundIDCol"}, 'CompoundIDPrefix' => $OptionsInfo{"${FingerprintsFileMode}CompoundIDPrefix"}, 'InDelim' => $FingerprintsFilesInfo{$FingerprintsFileMode}{InDelim});
 938       last FILETYPE;
 939     }
 940     die "Error: Fingerprints file type, $FileType, is not valid. Supported file types: SD, FP or Text\n";
 941   }
 942 
 943   return %FingerprintsFileIOParams;
 944 }
 945 
# Make sure fingerprints data file contains valid data and retrieve fingerprints string mode information...
 947 #
 948 sub RetrieveFingerprintsFileFingerprintsStringInfo {
 949   my($FingerprintsFileMode, $FingerprintsFile) = @_;
 950   my($FingerprintsFileIO, $FingerprintsStringMode, $FingerprintsBitVectorStringMode, $FingerprintsVectorStringMode, $FirstFingerprintsStringType, $FirstFingerprintsStringDescription);
 951 
 952   $FingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{$FingerprintsFileMode}{FingerprintsFileIOParameters}});
 953   if (!$FingerprintsFileIO) {
 954     die "Error: Reference fingerprints file, $FingerprintsFile, contains invalid fingerprints data.\n";
 955   }
 956   if (!$FingerprintsFileIO->IsFingerprintsFileDataValid()) {
 957     die "Error: Reference fingerprints file, $FingerprintsFile, contains invalid fingerprints data.\n";
 958   }
 959 
 960   $FingerprintsStringMode = $FingerprintsFileIO->GetFingerprintsStringMode();
 961   $FingerprintsBitVectorStringMode = $FingerprintsFileIO->GetFingerprintsBitVectorStringMode();
 962   $FingerprintsVectorStringMode = $FingerprintsFileIO->GetFingerprintsVectorStringMode();
 963 
 964   $FirstFingerprintsStringType = $FingerprintsFileIO->GetFirstFingerprintsStringType();
 965   $FirstFingerprintsStringDescription = $FingerprintsFileIO->GetFirstFingerprintsStringDescription();
 966 
 967   $FingerprintsFileIO->Close();
 968 
 969   return ($FingerprintsStringMode, $FingerprintsBitVectorStringMode, $FingerprintsVectorStringMode, $FirstFingerprintsStringType, $FirstFingerprintsStringDescription);
 970 }
 971 
 972 # Retrieve output files names using reference fingerprints file name...
 973 #
 974 sub RetrieveOutputFilesInfo {
 975   my($FingerprintsFile, $FileDir, $FileExt, $FileName, $OutFileRoot, $SDOutFileName, $TextOutFileName, $SDOutFileExt, $TextOutFileExt, $ReferenceFileName, $DatabaseFileName);
 976 
 977   $OutputFilesInfo{OutFileRoot} = '';
 978   $OutputFilesInfo{SDOutFileName} = '';
 979   $OutputFilesInfo{TextOutFileName} = '';
 980 
 981   $FingerprintsFile = $FingerprintsFilesInfo{Reference}{FileName};
 982 
 983   $FileDir = ""; $FileName = ""; $FileExt = "";
 984   ($FileDir, $FileName, $FileExt) = ParseFileName($FingerprintsFile);
 985 
 986   $SDOutFileExt = "sdf";
 987   $TextOutFileExt = ($Options{outdelim} =~ /^tab$/i) ? "tsv" : "csv";
 988 
 989   if ($OptionsInfo{OutFileRoot}) {
 990     my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 991     if ($RootFileName && $RootFileExt) {
 992       $FileName = $RootFileName;
 993     }
 994     else {
 995       $FileName = $OptionsInfo{OutFileRoot};
 996     }
 997     $OutFileRoot = $FileName;
 998   }
 999   else {
1000     $OutFileRoot = "${FileName}SimilaritySearching";
1001   }
1002 
1003   $SDOutFileName = "${OutFileRoot}.${SDOutFileExt}";
1004   $TextOutFileName = "${OutFileRoot}.${TextOutFileExt}";
1005 
1006   $ReferenceFileName = $FingerprintsFilesInfo{Reference}{FileName};
1007   $DatabaseFileName = $FingerprintsFilesInfo{Database}{FileName};
1008 
1009   if ($OptionsInfo{SDOutput}) {
1010     if ($SDOutFileName =~ /^$ReferenceFileName$/i) {
1011       die "Error: Same output, $SDOutFileName, and reference input file names.\nSpecify a different name using \"-r --root\" option or use default name.\n";
1012     }
1013     if ($SDOutFileName =~ /^$DatabaseFileName$/i) {
1014       die "Error: Same output, $SDOutFileName, and database input file names.\nSpecify a different name using \"-r --root\" option or use default name.\n";
1015     }
1016   }
1017 
1018   if ($OptionsInfo{TextOutput}) {
1019     if ($TextOutFileName =~ /^$ReferenceFileName$/i) {
1020       die "Error: Same output, $TextOutFileName, and reference input file names.\nSpecify a different name using \"-r --root\" option or use default name.\n";
1021     }
1022     if ($TextOutFileName =~ /^$DatabaseFileName$/i) {
1023       die "Error: Same output, $TextOutFileName, and database input file names.\nSpecify a different name using \"-r --root\" option or use default name.\n";
1024     }
1025   }
1026 
1027   if (!$OptionsInfo{OverwriteFiles}) {
1028     if ($OptionsInfo{SDOutput}) {
1029       if (-e $SDOutFileName) {
1030         die "Error: The output file $SDOutFileName already exists.\n";
1031       }
1032     }
1033     if ($OptionsInfo{TextOutput}) {
1034       if (-e $TextOutFileName) {
1035         die "Error: The output file $TextOutFileName already exists.\n";
1036       }
1037     }
1038   }
1039 
1040   $OutputFilesInfo{OutFileRoot} = $OutFileRoot;
1041   $OutputFilesInfo{SDOutFileName} = $SDOutFileName;
1042   $OutputFilesInfo{TextOutFileName} = $TextOutFileName;
1043 
1044 }
1045 
1046 # Process input fingerprints file names...
1047 #
sub ProcessFingerprintsFileNames {
  # Fills @FingerprintsFilesList with the two command line file names: the reference
  # fingerprints file followed by the database fingerprints file. Dies with the usage
  # message unless exactly two names are specified.

  @FingerprintsFilesList = ();

  unless (@ARGV == 2) {
    die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  }

  # Reference file name first; database file name second...
  push @FingerprintsFilesList, @ARGV[0, 1];
}
1062 
1063 # Process option values...
sub ProcessOptions {
  # Rebuilds %OptionsInfo from the command line values available in %Options. Options
  # for bit vector and vector comparison along with reference and database fingerprints
  # data are processed by their own subs. Dies for an invalid "-k, --KNN" value.

  %OptionsInfo = ();

  @OptionsInfo{qw(Mode FingerprintsMode SearchMode)} = @Options{qw(mode fingerprintsmode searchmode)};

  ProcessBitVectorComparisonOptions();
  ProcessVectorComparisonOptions();

  $OptionsInfo{GroupFusionRule} = $Options{groupfusionrule};
  $OptionsInfo{GroupFusionApplyCutoff} = ($Options{groupfusionapplycutoff} =~ /^Yes$/i) ? 1 : 0;

  @OptionsInfo{qw(SimilarCountMode NumOfSimilarMolecules PercentSimilarMolecules)} = @Options{qw(similarcountmode numofsimilarmolecules percentsimilarmolecules)};

  # MaxSimilarMolecules starts out as NumOfSimilarMolecules. For PercentSimilar value of
  # SimilarCountMode, it's overwritten later using number of entries in database
  # fingerprints file and value of PercentSimilarMolecules...
  #
  $OptionsInfo{MaxSimilarMolecules} = $OptionsInfo{NumOfSimilarMolecules};

  @OptionsInfo{qw(SimilarityCutoff DistanceCutoff kNN)} = @Options{qw(similaritycutoff distancecutoff knn)};

  # kNN is either All or a positive integer...
  if ($Options{knn} !~ /^All$/i && !IsPositiveInteger($Options{knn})) {
    die "Error: The value specified, $Options{knn}, for option \"-k, --KNN\" is not valid. Allowed values: > 0 \n";
  }

  ProcessReferenceFingerprintsDataOptions();
  ProcessDatabaseFingerprintsDataOptions();

  @OptionsInfo{qw(Detail InDelim)} = @Options{qw(detail indelim)};

  # Map output delimiter name to the actual delimiter character...
  if ($Options{outdelim} =~ /tab/i ) {
    $OptionsInfo{OutDelim} = "\t";
  }
  elsif ($Options{outdelim} =~ /semicolon/i) {
    $OptionsInfo{OutDelim} = "\;";
  }
  else {
    $OptionsInfo{OutDelim} = "\,";
  }
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

  # Both turns on SD as well as text output...
  $OptionsInfo{Output} = $Options{output};
  $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|Both)$/i) ? 1 : 0;
  $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|Both)$/i) ? 1 : 0;

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} || 0;

  # Fast mode turns off data validation...
  $OptionsInfo{Fast} = $Options{fast} ? 1 : 0;
  $OptionsInfo{ValidateData} = $Options{fast} ? 0 : 1;

  $OptionsInfo{Precision} = $Options{precision};
}
1118 
# Process options related to comparison of bit vector strings...
1120 #
sub ProcessBitVectorComparisonOptions {
  # Validates the comparison measure specified for bit vector fingerprints strings and
  # stores it, along with its name, method name, and any alpha and beta parameters, in
  # %OptionsInfo. Dies for an unsupported measure or a missing or invalid alpha or beta
  # value.

  my(@MeasureNames, %MeasureNameMap, %MeasureMethodMap);

  # Method names of supported similarity coefficients end with "Coefficient". Names
  # presented to the user don't, so set up lower case name based maps to both forms...
  for my $MethodName (Fingerprints::FingerprintsBitVector::GetSupportedSimilarityCoefficients()) {
    my $MeasureName = $MethodName;
    $MeasureName =~ s/Coefficient$//;

    my $MeasureKey = lc($MeasureName);
    push @MeasureNames, $MeasureName;
    $MeasureNameMap{$MeasureKey} = $MeasureName;
    $MeasureMethodMap{$MeasureKey} = $MethodName;
  }

  # Look up the specified measure ignoring case...
  my $RequestedMeasure = $Options{bitvectorcomparisonmode};

  unless (exists $MeasureMethodMap{lc($RequestedMeasure)}) {
      die "Error: The value specified, $RequestedMeasure, for option \"-b --BitVectorComparisonMode\" is not valid.\nAllowed values:", JoinWords(\@MeasureNames, ", ", 0), "\n";
  }

  my $RequestedKey = lc($RequestedMeasure);

  $OptionsInfo{BitVectorComparisonMode} = $Options{bitvectorcomparisonmode};

  $OptionsInfo{SpecifiedBitVectorComparisonMeasure} = $RequestedMeasure;
  $OptionsInfo{SpecifiedBitVectorComparisonMeasureName} = $MeasureNameMap{$RequestedKey};
  $OptionsInfo{SpecifiedBitVectorComparisonMeasureMethod} = $MeasureMethodMap{$RequestedKey};

  # Alpha is required for Tversky based measures and must be between 0 and 1...
  $OptionsInfo{Alpha} = '';
  if ($RequestedMeasure =~ /^(TverskySimilarity|WeightedTverskySimilarity)$/i) {
    my $AlphaValue = $Options{alpha};
    if (IsEmpty($AlphaValue)) {
      die "Error: You must specify a value for \"-a, --alpha\" option in \"TverskySimilarity or WeightedTverskySimilarity\" \"-m --mode\". \n";
    }
    unless (IsFloat($AlphaValue) && $AlphaValue >= 0 && $AlphaValue <= 1) {
      die "Error: The value specified, $AlphaValue, for option \"-a, --alpha\" is not valid. Allowed values: >= 0 and <= 1\n";
    }
    $OptionsInfo{Alpha} = $AlphaValue;
  }

  # Beta is required for weighted Tanimoto and Tversky measures and must be between
  # 0 and 1...
  $OptionsInfo{Beta} = '';
  if ($RequestedMeasure =~ /^(WeightedTverskySimilarity|WeightedTanimotoSimilarity)$/i) {
    my $BetaValue = $Options{beta};
    if (IsEmpty($BetaValue)) {
      die "Error: You must specify a value for \"-b, --beta\" option in \"WeightedTverskySimilarity or WeightedTanimotoSimilarity\" \"-m --mode\". \n";
    }
    unless (IsFloat($BetaValue) && $BetaValue >= 0 && $BetaValue <= 1) {
      die "Error: The value specified, $BetaValue, for option \"-b, --beta\" is not valid. Allowed values: >= 0 and <= 1\n";
    }
    $OptionsInfo{Beta} = $BetaValue;
  }
}
1190 
# Process options related to comparison of vector strings...
1192 #
sub ProcessVectorComparisonOptions {
  # Validates the comparison measure and formulism specified for vector fingerprints
  # strings and stores them in %OptionsInfo. Spaces in specified values are ignored
  # during validation. Dies for an unsupported measure or formulism.

  my(@MeasureNames, %MeasureNameMap, %MeasureMethodMap);

  # Method names of supported distance and similarity coefficients may end with
  # "Coefficient". Names presented to the user don't, so set up lower case name based
  # maps to both forms...
  for my $MethodName (Fingerprints::FingerprintsVector::GetSupportedDistanceAndSimilarityCoefficients()) {
    my $MeasureName = $MethodName;
    $MeasureName =~ s/Coefficient$//i;

    my $MeasureKey = lc($MeasureName);
    push @MeasureNames, $MeasureName;
    $MeasureNameMap{$MeasureKey} = $MeasureName;
    $MeasureMethodMap{$MeasureKey} = $MethodName;
  }

  # Look up the specified measure ignoring case and spaces...
  my $RequestedMeasure = $Options{vectorcomparisonmode};
  $RequestedMeasure =~ s/ //g;

  unless (exists($MeasureMethodMap{lc($RequestedMeasure)})) {
    die "Error: The value specified, $RequestedMeasure, for option \"-v --VectorComparisonMode\" is not valid.\nAllowed values:", JoinWords(\@MeasureNames, ", ", 0), "\n";
  }

  my $RequestedKey = lc($RequestedMeasure);

  $OptionsInfo{VectorComparisonMode} = $Options{vectorcomparisonmode};

  $OptionsInfo{SpecifiedVectorComparisonMeasure} = $RequestedMeasure;
  $OptionsInfo{SpecifiedVectorComparisonMeasuresName} = $MeasureNameMap{$RequestedKey};
  $OptionsInfo{SpecifiedVectorComparisonMeasuresMethod} = $MeasureMethodMap{$RequestedKey};

  # Validate the form used for vector comparison calculations...
  my $RequestedFormulism = $Options{vectorcomparisonformulism};
  $RequestedFormulism =~ s/ //g;

  unless ($RequestedFormulism =~ /^(AlgebraicForm|BinaryForm|SetTheoreticForm)$/i) {
    die "Error: The value specified, $RequestedFormulism, for option \"--VectorComparisonFormulism\" is not valid. Allowed values: AlgebraicForm, BinaryForm or SetTheoreticForm\n";
  }

  $OptionsInfo{VectorComparisonFormulism} = $Options{vectorcomparisonformulism};
  $OptionsInfo{SpecifiedVectorComparisonMode} = $RequestedFormulism;

}
1247 
1248 # Process options related to data retrieval from reference fingerprints SD and CSV/TSV
1249 # text files...
1250 #
sub ProcessReferenceFingerprintsDataOptions {
  # Validates options used to locate compound IDs and fingerprints in the reference
  # fingerprints file and stores them in %OptionsInfo: column options apply to text
  # files and data field options apply to SD files. Unspecified fingerprints and
  # compound ID columns, and an unspecified fingerprints field, are set to AutoDetect.
  # Dies for invalid column numbers, a missing compound ID field in DataField mode, or
  # identical compound ID and fingerprints locations.

  my $IDCol = $Options{referencecompoundidcol};
  my $FPCol = $Options{referencefingerprintscol};
  my $IDField = $Options{referencecompoundidfield};
  my $FPField = $Options{referencefingerprintsfield};

  $OptionsInfo{ReferenceCompoundIDPrefix} = $Options{referencecompoundidprefix} || 'Cmpd';

  # Compound ID and fingerprints column options for text files...

  $OptionsInfo{ReferenceColMode} = $Options{referencecolmode};

  if (IsNotEmpty($IDCol)) {
    if ($Options{referencecolmode} =~ /^ColNum$/i && !IsPositiveInteger($IDCol)) {
      die "Error: Column value, $IDCol, specified using \"--ReferenceCompoundIDCol\" is not valid: Allowed integer values: > 0\n";
    }
    $OptionsInfo{ReferenceCompoundIDCol} = $IDCol;
  }
  else {
    $OptionsInfo{ReferenceCompoundIDCol} = 'AutoDetect';
  }

  if (IsNotEmpty($FPCol)) {
    if ($Options{referencecolmode} =~ /^ColNum$/i && !IsPositiveInteger($FPCol)) {
      die "Error: Column value, $FPCol, specified using \"--ReferenceFingerprintsCol\" is not valid: Allowed integer values: > 0\n";
    }
    $OptionsInfo{ReferenceFingerprintsCol} = $FPCol;
  }
  else {
    $OptionsInfo{ReferenceFingerprintsCol} = 'AutoDetect';
  }

  # Both columns can't be the same: compare as numbers when both are positive integers
  # and as strings otherwise...
  if (IsNotEmpty($IDCol) && IsNotEmpty($FPCol)) {
    my $SameCol;
    if (IsPositiveInteger($IDCol) && IsPositiveInteger($FPCol)) {
      $SameCol = ($IDCol == $FPCol);
    }
    else {
      $SameCol = ($IDCol eq $FPCol);
    }
    if ($SameCol) {
      die "Error: Values specified using \"--ReferenceCompoundIDCol\" and \"--ReferenceFingerprintsCol\", $IDCol, must be different.\n";
    }
  }

  # Compound ID and fingerprints field options for SD files...

  $OptionsInfo{ReferenceCompoundIDMode} = $Options{referencecompoundidmode};
  $OptionsInfo{ReferenceCompoundIDField} = '';

  if ($Options{referencecompoundidmode} =~ /^DataField$/i && !$IDField) {
    die "Error: You must specify a value for \"--ReferenceCompoundIDField\" option in \"DataField\" \"--ReferenceCompoundIDMode\". \n";
  }
  if ($IDField) {
    $OptionsInfo{ReferenceCompoundIDField} = $IDField;
  }

  $OptionsInfo{ReferenceFingerprintsField} = IsNotEmpty($FPField) ? $FPField : 'AutoDetect';

  # Both fields can't be the same...
  if ($IDField && IsNotEmpty($FPField) && $IDField eq $FPField) {
    die "Error: Values specified using \"--ReferenceCompoundIDField\" and \"--ReferenceFingerprintsfield\", $IDField, must be different.\n";
  }

}
1322 
# Validate the options controlling retrieval of compound IDs, fingerprints and
# data values from database fingerprints SD and CSV/TSV text files, and store
# the processed values in %OptionsInfo...
#
# The script dies with an error message on the first invalid or inconsistent
# option value.
#
sub ProcessDatabaseFingerprintsDataOptions {
  my($ColNumMode, $CompoundIDColSpecified, $FingerprintsColSpecified, $FingerprintsFieldSpecified);

  $OptionsInfo{DatabaseCompoundIDPrefix} = $Options{databasecompoundidprefix} || 'Cmpd';

  # Text files: compound ID and fingerprints columns. Column values are checked
  # as column numbers only in ColNum mode; a column which is not specified is
  # set to AutoDetect...

  $OptionsInfo{DatabaseColMode} = $Options{databasecolmode};
  $ColNumMode = ($Options{databasecolmode} =~ /^ColNum$/i) ? 1 : 0;

  $CompoundIDColSpecified = IsNotEmpty($Options{databasecompoundidcol});
  if ($CompoundIDColSpecified && $ColNumMode && !IsPositiveInteger($Options{databasecompoundidcol})) {
    die "Error: Column value, $Options{databasecompoundidcol}, specified using \"--DatabaseCompoundIDCol\" is not valid: Allowed integer values: > 0\n";
  }
  $OptionsInfo{DatabaseCompoundIDCol} = $CompoundIDColSpecified ? $Options{databasecompoundidcol} : 'AutoDetect';

  $FingerprintsColSpecified = IsNotEmpty($Options{databasefingerprintscol});
  if ($FingerprintsColSpecified && $ColNumMode && !IsPositiveInteger($Options{databasefingerprintscol})) {
    die "Error: Column value, $Options{databasefingerprintscol}, specified using \"--DatabaseFingerprintsCol\" is not valid: Allowed integer values: > 0\n";
  }
  $OptionsInfo{DatabaseFingerprintsCol} = $FingerprintsColSpecified ? $Options{databasefingerprintscol} : 'AutoDetect';

  # Compound ID and fingerprints must not come from the same column...
  if ($CompoundIDColSpecified && $FingerprintsColSpecified) {
    my($SameCol);

    # Compare numerically when IsPositiveInteger accepts both values; otherwise,
    # compare as strings...
    if (IsPositiveInteger($Options{databasecompoundidcol}) && IsPositiveInteger($Options{databasefingerprintscol})) {
      $SameCol = ($Options{databasecompoundidcol} == $Options{databasefingerprintscol}) ? 1 : 0;
    }
    else {
      $SameCol = ($Options{databasecompoundidcol} eq $Options{databasefingerprintscol}) ? 1 : 0;
    }
    if ($SameCol) {
      die "Error: Values specified using \"--DatabaseCompoundIDCol\" and \"--DatabaseFingerprintsCol\", $Options{databasecompoundidcol}, must be different.\n";
    }
  }

  # Text files: database data columns...

  $OptionsInfo{DatabaseDataColsMode} = $Options{databasedatacolsmode};
  $OptionsInfo{DatabaseDataCols} = '';
  @{$OptionsInfo{SpecifiedDatabaseDataCols}} = ();

  if ($Options{databasedatacolsmode} =~ /^Specify$/i) {
    my($DatabaseColNum, $DataColsValue, @DataColsList);

    $DataColsValue = $Options{databasedatacols};
    if (!$DataColsValue) {
      die "Error: You must specify a value for \"--DatabaseDataCols\" option in \"Specify\" \"--DatabaseDataColsMode\". \n";
    }

    # Spaces are dropped from a list of column numbers before splitting it;
    # column labels are split as specified...
    if ($ColNumMode) {
      $DataColsValue =~ s/ //g;
    }
    @DataColsList = split /\,/, $DataColsValue;

    if ($ColNumMode) {
      for $DatabaseColNum (@DataColsList) {
        if (!IsPositiveInteger($DatabaseColNum)) {
          die "Error: Column value, $DatabaseColNum, specified using \"--DatabaseDataCols\" is not valid: Allowed integer values: > 0\n";
        }
      }
    }

    $OptionsInfo{DatabaseDataCols} = $DataColsValue;
    push @{$OptionsInfo{SpecifiedDatabaseDataCols}}, @DataColsList;

    # The processed value may be empty at this point, e.g. after dropping the
    # spaces from a value containing nothing else...
    if (!$OptionsInfo{DatabaseDataCols}) {
      die "Error: You must specify a value for \"--DatabaseDataCols\" option in \"Specify\" \"--DatabaseDataColsMode\". \n";
    }
  }
  elsif ($Options{databasedatacolsmode} =~ /^All$/i) {
    $OptionsInfo{DatabaseDataCols} = 'All';
  }

  # SD files: compound ID and fingerprints data fields...

  $OptionsInfo{DatabaseCompoundIDMode} = $Options{databasecompoundidmode};
  $OptionsInfo{DatabaseCompoundIDField} = $Options{databasecompoundidfield} || '';

  if ($Options{databasecompoundidmode} =~ /^DataField$/i && !$Options{databasecompoundidfield}) {
    die "Error: You must specify a value for \"--DatabaseCompoundIDField\" option in \"DataField\" \"--DatabaseCompoundIDMode\". \n";
  }

  $FingerprintsFieldSpecified = IsNotEmpty($Options{databasefingerprintsfield});
  $OptionsInfo{DatabaseFingerprintsField} = $FingerprintsFieldSpecified ? $Options{databasefingerprintsfield} : 'AutoDetect';

  # Compound ID and fingerprints must not come from the same data field...
  if ($Options{databasecompoundidfield} && $FingerprintsFieldSpecified && ($Options{databasecompoundidfield} eq $Options{databasefingerprintsfield})) {
    die "Error: Values specified using \"--DatabaseCompoundIDField\" and \"--DatabaseFingerprintsfield\", $Options{databasecompoundidfield}, must be different.\n";
  }

  # SD files: database data fields...

  $OptionsInfo{DatabaseDataFieldsMode} = $Options{databasedatafieldsmode};
  $OptionsInfo{DatabaseDataFields} = '';
  @{$OptionsInfo{SpecifiedDatabaseDataFields}} = ();

  if (!$Options{databasedatafields}) {
    if ($Options{databasedatafieldsmode} =~ /^Specify$/i) {
      die "Error: You must specify a value for \"--DatabaseDataFields\" option in \"Specify\" \"--DatabaseDataFieldsMode\". \n";
    }
  }
  else {
    # Data fields specified by the user are stored irrespective of the data fields mode...
    $OptionsInfo{DatabaseDataFields} = $Options{databasedatafields};
    push @{$OptionsInfo{SpecifiedDatabaseDataFields}}, split(/\,/, $Options{databasedatafields});
  }
}
1451 
# Setup script usage and retrieve command line options...
#
# Default values are assigned to %Options, the options specified on the command
# line are retrieved into %Options using GetOptions, the current directory is
# changed to the working directory when one is specified, and the values of
# options restricted to a fixed set or range are validated.
#
# The script dies with an error message when option parsing fails or on the
# first invalid option value.
#
sub SetupScriptUsage {

  # Setup default values for the options...
  %Options = ();

  $Options{alpha} = 0.5;
  $Options{beta} = 1;

  $Options{bitvectorcomparisonmode} = "TanimotoSimilarity";

  $Options{databasecolmode} = 'colnum';

  $Options{databasecompoundidprefix} = 'Cmpd';
  $Options{databasecompoundidmode} = 'LabelPrefix';

  $Options{databasedatacolsmode} = 'CompoundID';
  $Options{databasedatafieldsmode} = 'CompoundID';

  $Options{distancecutoff} = 10;

  $Options{referencecolmode} = 'colnum';

  $Options{referencecompoundidprefix} = 'Cmpd';
  $Options{referencecompoundidmode} = 'LabelPrefix';

  $Options{detail} = 1;

  $Options{fingerprintsmode} = 'AutoDetect';
  $Options{groupfusionrule} = 'Max';
  $Options{groupfusionapplycutoff} = 'Yes';

  $Options{knn} = 'All';

  $Options{mode} = 'MultipleReferences';

  $Options{numofsimilarmolecules} = 10;
  $Options{percentsimilarmolecules} = 1;

  $Options{indelim} = 'comma';
  $Options{outdelim} = 'comma';
  $Options{quote} = 'yes';

  $Options{output} = 'text';

  $Options{precision} = 2;

  $Options{searchmode} = 'SimilaritySearch';

  $Options{similarcountmode} = 'NumOfSimilar';

  $Options{similaritycutoff} = 0.75;

  $Options{vectorcomparisonmode} = 'TanimotoSimilarity';
  $Options{vectorcomparisonformulism} = 'AlgebraicForm';

  # Retrieve all the options; values specified on the command line replace the defaults...
  if (!GetOptions(\%Options,
                  "alpha=f", "beta=f", "bitvectorcomparisonmode|b=s",
                  "databasecolmode=s", "databasecompoundidcol=s", "databasecompoundidprefix=s",
                  "databasecompoundidfield=s", "databasecompoundidmode=s",
                  "databasedatacols=s", "databasedatacolsmode=s",
                  "databasedatafields=s", "databasedatafieldsmode=s",
                  "databasefingerprintscol=s", "databasefingerprintsfield=s",
                  "distancecutoff=f", "detail|d=i", "fast|f", "fingerprintsmode=s",
                  "groupfusionrule|g=s", "groupfusionapplycutoff=s",
                  "help|h", "indelim=s", "knn|k=s", "mode|m=s",
                  "numofsimilarmolecules|n=i", "outdelim=s", "output=s", "overwrite|o",
                  "percentsimilarmolecules|p=f", "precision=s", "quote|q=s",
                  "referencecolmode=s", "referencecompoundidcol=s", "referencecompoundidprefix=s",
                  "referencecompoundidfield=s", "referencecompoundidmode=s",
                  "referencefingerprintscol=s", "referencefingerprintsfield=s",
                  "root|r=s", "searchmode|s=s", "similarcountmode=s", "similaritycutoff=f",
                  "vectorcomparisonmode|v=s", "vectorcomparisonformulism=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }

  # Change to the working directory before any further processing...
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate option values; the allowed values are matched ignoring case...
  if ($Options{databasecolmode} !~ /^(ColNum|ColLabel)$/i) {
    die "Error: The value specified, $Options{databasecolmode}, for option \"--DatabaseColMode\" is not valid. Allowed values: ColNum, or ColLabel\n";
  }
  if ($Options{databasecompoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
    die "Error: The value specified, $Options{databasecompoundidmode}, for option \"--DatabaseCompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
  }
  if ($Options{databasedatacolsmode} !~ /^(All|Specify|CompoundID)$/i) {
    die "Error: The value specified, $Options{databasedatacolsmode}, for option \"--DatabaseDataColsMode\" is not valid. Allowed values: All, Specify, or CompoundID\n";
  }
  if ($Options{databasedatafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
    die "Error: The value specified, $Options{databasedatafieldsmode}, for option \"--DatabaseDataFieldsMode\" is not valid. Allowed values: All, Common, Specify, or CompoundID\n";
  }
  if (!IsPositiveInteger($Options{detail})) {
    die "Error: The value specified, $Options{detail}, for option \"-d, --detail\" is not valid. Allowed values: > 0 \n";
  }
  if ($Options{fingerprintsmode} !~ /^(AutoDetect|FingerprintsBitVectorString|FingerprintsVectorString)$/i) {
    die "Error: The value specified, $Options{fingerprintsmode}, for option \"--FingerprintsMode\" is not valid. Allowed values: AutoDetect, FingerprintsBitVectorString or FingerprintsVectorString \n";
  }
  if ($Options{groupfusionrule} !~ /^(Max|Min|Mean|Median|Sum|Euclidean)$/i) {
    die "Error: The value specified, $Options{groupfusionrule}, for option \"-g, --GroupFusionRule\" is not valid. Allowed values: Max, Min, Mean, Median, Sum, Euclidean\n";
  }
  if ($Options{groupfusionapplycutoff} !~ /^(Yes|No)$/i) {
    # Report the offending value of this option and not the value of the quote option...
    die "Error: The value specified, $Options{groupfusionapplycutoff}, for option \"--GroupFusionApplyCutoff\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--InDelim\" is not valid. Allowed values: comma, or semicolon\n";
  }
  if ($Options{mode} !~ /^(IndividualReference|MultipleReferences)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: IndividualReference, MultipleReferences\n";
  }
  if (!IsPositiveInteger($Options{numofsimilarmolecules})) {
    die "Error: The value specified, $Options{numofsimilarmolecules}, for option \"-n, --NumOfSimilarMolecules\" is not valid. Allowed values: > 0 \n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--OutDelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{output} !~ /^(SD|text|both)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n";
  }
  if (!(IsFloat($Options{percentsimilarmolecules}) && $Options{percentsimilarmolecules} > 0 && $Options{percentsimilarmolecules} <= 100)) {
    die "Error: The value specified, $Options{percentsimilarmolecules}, for option \"-p, --PercentSimilarMolecules\" is not valid. Allowed values: > 0 and <= 100 \n";
  }
  if ($Options{quote} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
  }
  if (!IsPositiveInteger($Options{precision})) {
    die "Error: The value specified, $Options{precision}, for option \"--precision\" is not valid. Allowed values: > 0 \n";
  }
  if ($Options{referencecolmode} !~ /^(ColNum|ColLabel)$/i) {
    die "Error: The value specified, $Options{referencecolmode}, for option \"--ReferenceColMode\" is not valid. Allowed values: ColNum, or ColLabel\n";
  }
  if ($Options{referencecompoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
    die "Error: The value specified, $Options{referencecompoundidmode}, for option \"--ReferenceCompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
  }
  if ($Options{searchmode} !~ /^(SimilaritySearch|DissimilaritySearch)$/i) {
    die "Error: The value specified, $Options{searchmode}, for option \"-s, --SearchMode\" is not valid. Allowed values: SimilaritySearch, DissimilaritySearch \n";
  }
  if ($Options{similarcountmode} !~ /^(NumOfSimilar|PercentSimilar)$/i) {
    die "Error: The value specified, $Options{similarcountmode}, for option \"--SimilarCountMode\" is not valid. Allowed values: NumOfSimilar, PercentSimilar \n";
  }
}
1578