#!/usr/bin/perl -w
#
# File: SimilaritySearchingFingerprints.pl
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2024 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;
use SDFileUtil;
use StatisticsUtil;
use PseudoHeap;
use Fingerprints::FingerprintsFileUtil;
use Fingerprints::FingerprintsBitVector;
use Fingerprints::FingerprintsVector;

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Turn on autoflush for STDOUT...
$| = 1;

# Announce the start of the run and record the starting time...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Set up script usage; print usage and quit when help is requested or when the
# command line doesn't contain exactly two file names...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV != 2);

# Process reference and database file names...
my(@FingerprintsFilesList);
ProcessFingerprintsFileNames();

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Set up information about fingerprints input and SD/text output files...
my(%FingerprintsFilesInfo, %OutputFilesInfo, %SimilaritySearchInfo);
print "Checking and retrieving information from reference and database fingerprints files...\n";
RetrieveFingerprintsFilesInfo();

# Perform similarity search...
print "Performing similarity search...\n";
my(%SimilaritySearchResults, %DatabaseFingerprintsFileData);
PerformSimilaritySearch();

print "\n$ScriptName:Done...\n\n";

# Report the elapsed time for the whole run...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Drive the similarity search: read the reference fingerprints, reset the results
# containers, compare database fingerprints against the references, and write
# out the result files...
#
sub PerformSimilaritySearch {
  print "\nProcessing fingerprints data for reference molecules...\n";
  ReadReferenceFingerprintsData();

  InitializeSimilaritySearchResults();
  GenerateSimilaritySearchResults();
  WriteSimilaritySearchResultFiles();
}

# Find similar molecules from database molecules for individual or multiple reference molecules...
#
# Each entry read from the database fingerprints file is compared against every
# reference fingerprints object. In individual reference mode, each comparison value
# that survives the comparison is collected for its reference compound; in multiple
# references mode, the values for a database entry are fused into one value which is
# then collected. Database entries with invalid fingerprints data are counted and
# skipped. A summary of processed and ignored entries is printed at the end.
#
sub GenerateSimilaritySearchResults {
  print "Processing fingerprints data for database molecules...\n";

  my($FingerprintsCount, $IgnoredFingerprintsCount) = (0, 0);

  my $DatabaseFingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{Database}{FingerprintsFileIOParameters}});
  $DatabaseFingerprintsFileIO->Open();

  DATABASEFP: while ($DatabaseFingerprintsFileIO->Read()) {
    $FingerprintsCount++;

    if (!$DatabaseFingerprintsFileIO->IsFingerprintsDataValid()) {
      $IgnoredFingerprintsCount++;
      next DATABASEFP;
    }
    my $DatabaseFingerprintsObject = $DatabaseFingerprintsFileIO->GetFingerprints();
    my $DatabaseCmpdID = $DatabaseFingerprintsFileIO->GetCompoundID();

    # Comparison values for the current database entry against all references; only
    # filled in during multiple references mode...
    my @ComparisonValues = ();

    REFERENCEFP: for my $ReferenceIndex (0 .. $#{$SimilaritySearchInfo{ReferenceCmpdIDsRef}}) {
      my $ReferenceCmpdID = $SimilaritySearchInfo{ReferenceCmpdIDsRef}->[$ReferenceIndex];
      my $ReferenceFingerprintsObject = $SimilaritySearchInfo{ReferenceFingerprintsObjectsRef}->[$ReferenceIndex];

      # An undefined value means the comparison failed or didn't pass the cutoff...
      my $ComparisonValue = CompareReferenceAndDatabaseFingerprintsPair($ReferenceFingerprintsObject, $DatabaseFingerprintsObject);
      if (!defined $ComparisonValue) {
        next REFERENCEFP;
      }

      if ($SimilaritySearchInfo{IndividualReferenceMode}) {
        CollectSimilaritySearchResults($DatabaseFingerprintsFileIO, $DatabaseCmpdID, $ComparisonValue, $ReferenceCmpdID);
      }
      elsif ($SimilaritySearchInfo{MultipleReferencesMode}) {
        push @ComparisonValues, $ComparisonValue;
      }
    }

    if ($SimilaritySearchInfo{MultipleReferencesMode}) {
      # No fused value is available when no comparison value was kept...
      my $FusedComparisonValue = CalculateGroupFusionComparisonValue(\@ComparisonValues);
      if (!defined $FusedComparisonValue) {
        next DATABASEFP;
      }
      CollectSimilaritySearchResults($DatabaseFingerprintsFileIO, $DatabaseCmpdID, $FusedComparisonValue);
    }
  }
  $DatabaseFingerprintsFileIO->Close();

  print "Number of fingerprints data entries in database fingerprints file: $FingerprintsCount\n";
  print "Number of fingerprints data entries processed successfully: ", ($FingerprintsCount - $IgnoredFingerprintsCount) , "\n";
  print "Number of fingerprints data entries ignored due to missing/invalid data: $IgnoredFingerprintsCount\n\n";
}

# Compare a pair of reference and database fingerprints objects corresponding to bit-vector or
# vectors using specified comparison method and comparison cutoff...
#
# The comparison method named in %SimilaritySearchInfo is invoked on the reference
# object with the database object and any method parameters. Returns the comparison
# value formatted to the requested precision, or undef when the comparison fails
# or the value doesn't pass the cutoff being applied.
#
sub CompareReferenceAndDatabaseFingerprintsPair {
  my($ReferenceFingerprintsObject, $DatabaseFingerprintsObject) = @_;

  my $MethodName = $SimilaritySearchInfo{ComparisonMethod};
  my $Value = $ReferenceFingerprintsObject->$MethodName($DatabaseFingerprintsObject, @{$SimilaritySearchInfo{ComparisonMethodParameters}});

  unless (defined $Value) {
    warn "Warning: Ignoring fingerprints data for reference compound ID ", $ReferenceFingerprintsObject->GetID(), ": Its comparison with database compound ID, ", $DatabaseFingerprintsObject->GetID(), ", failed.\n";
    return undef;
  }

  $Value = sprintf("%.$OptionsInfo{Precision}f", $Value);

  # Nothing else to do without a cutoff...
  return $Value unless $SimilaritySearchInfo{ApplyComparisonCutoff};

  # Keep values at or above the cutoff when keeping top values; otherwise, keep
  # values at or below it...
  my $Cutoff = $SimilaritySearchInfo{ComparisonCutoff};
  my $PassesCutoff = $SimilaritySearchInfo{KeepTop} ? ($Value >= $Cutoff) : ($Value <= $Cutoff);

  return $PassesCutoff ? $Value : undef;
}

# Fuse a list of comparison values into a single value using the group fusion
# method set up in %SimilaritySearchInfo. The values are optionally sorted and
# truncated to the first kNN values before fusion. Returns undef for an empty list...
#
sub CalculateGroupFusionComparisonValue {
  my($ComparisonValuesRef) = @_;

  return undef unless @{$ComparisonValuesRef};

  if ($SimilaritySearchInfo{SortComparisonValues}) {
    # Descending order when keeping top values; ascending order otherwise...
    my @OrderedValues = $SimilaritySearchInfo{KeepTop}
      ? (sort { $b <=> $a } @{$ComparisonValuesRef})
      : (sort { $a <=> $b } @{$ComparisonValuesRef});

    if ($SimilaritySearchInfo{UsekNN} && ($OptionsInfo{kNN} < scalar @{$ComparisonValuesRef})) {
      # Drop everything past the first kNN values...
      splice @OrderedValues, $OptionsInfo{kNN};
    }
    $ComparisonValuesRef = \@OrderedValues;
  }

  my $FusedValue = $SimilaritySearchInfo{GroupFusionMethodRef}->($ComparisonValuesRef);
  if ($SimilaritySearchInfo{ApplyPrecisionDuringFusion}) {
    $FusedValue = sprintf("%.$OptionsInfo{Precision}f", $FusedValue);
  }

  return $FusedValue;
}

# Add a comparison value and database compound ID pair to the results heap for the
# specified reference compound ID or, without a reference compound ID, to the
# single results heap; collect database file data for the compound when needed...
#
sub CollectSimilaritySearchResults {
  my($DatabaseFingerprintsFileIO, $DatabaseCmpdID, $ComparisonValue, $ReferenceCmpdID) = @_;

  my $ResultsKey = defined $ReferenceCmpdID ? $ReferenceCmpdID : 'ResultsPseudoHeap';
  $SimilaritySearchResults{$ResultsKey}->AddKeyValuePair($ComparisonValue, $DatabaseCmpdID);

  if ($FingerprintsFilesInfo{Database}{CollectInputFileData}) {
    CollectDatabaseFileData($DatabaseCmpdID, $DatabaseFingerprintsFileIO);
  }
}

# Reset the results containers: one PseudoHeap per reference compound ID in individual
# reference mode, or a single PseudoHeap in multiple references mode. Any collected
# database file data is cleared as well...
#
sub InitializeSimilaritySearchResults {
  %SimilaritySearchResults = ();

  my @ResultsKeys = ();
  if ($SimilaritySearchInfo{IndividualReferenceMode}) {
    @ResultsKeys = @{$SimilaritySearchInfo{ReferenceCmpdIDsRef}};
  }
  elsif ($SimilaritySearchInfo{MultipleReferencesMode}) {
    @ResultsKeys = ('ResultsPseudoHeap');
  }

  for my $ResultsKey (@ResultsKeys) {
    $SimilaritySearchResults{$ResultsKey} = PseudoHeap->new('Type' => ($SimilaritySearchInfo{KeepTop} ? 'KeepTopN' : 'KeepBottomN'), 'KeyType' => 'Numeric', 'MaxSize' => $OptionsInfo{MaxSimilarMolecules});
  }

  %DatabaseFingerprintsFileData = ();
}

# Open the output files, write out collected results in sorted key order for each
# reference compound or for the single set of multiple references results, and
# close the output files...
#
sub WriteSimilaritySearchResultFiles {
  my($NewSDFileRef, $NewTextFileRef) = SetupAndOpenOutputFiles();

  if ($SimilaritySearchInfo{IndividualReferenceMode}) {
    for my $ReferenceCmpdID (@{$SimilaritySearchInfo{ReferenceCmpdIDsRef}}) {
      my $ResultsHeap = $SimilaritySearchResults{$ReferenceCmpdID};
      for my $ComparisonValue ($ResultsHeap->GetSortedKeys()) {
        for my $DatabaseCmpdID ($ResultsHeap->GetKeyValues($ComparisonValue)) {
          WriteDataToOutputFiles($NewSDFileRef, $NewTextFileRef, $ComparisonValue, $DatabaseCmpdID, $ReferenceCmpdID);
        }
      }
    }
  }
  elsif ($SimilaritySearchInfo{MultipleReferencesMode}) {
    my $ResultsHeap = $SimilaritySearchResults{ResultsPseudoHeap};
    for my $ComparisonValue ($ResultsHeap->GetSortedKeys()) {
      for my $DatabaseCmpdID ($ResultsHeap->GetKeyValues($ComparisonValue)) {
        WriteDataToOutputFiles($NewSDFileRef, $NewTextFileRef, $ComparisonValue, $DatabaseCmpdID);
      }
    }
  }

  close $NewSDFileRef if $NewSDFileRef;
  close $NewTextFileRef if $NewTextFileRef;
}

# Write individual reference or multiple references similarity results along with any other data to output files...
#
# Parameters: SD and text output file handles (either may be undef), comparison value,
# database compound ID, and an optional reference compound ID which is supplied
# only in individual reference mode.
#
sub WriteDataToOutputFiles {
  my($NewSDFileRef, $NewTextFileRef, $ComparisonValue, $DatabaseCmpdID, $ReferenceCmpdID) = @_;

  if ($NewSDFileRef) {
    # Molecule block, followed by ID and comparison value data fields, any database
    # data, and the record terminator...
    WriteMolStringDataToSDOutputFile($DatabaseCmpdID, $NewSDFileRef);
    if (defined $ReferenceCmpdID) {
      print $NewSDFileRef "> <ReferenceCmpdID>\n$ReferenceCmpdID\n\n";
    }
    print $NewSDFileRef "> <DatabaseCmpdID>\n$DatabaseCmpdID\n\n> <ComparisonValue>\n$ComparisonValue\n\n";
    WriteDatabaseDataToSDOutputFile($DatabaseCmpdID, $NewSDFileRef);
    print $NewSDFileRef "\$\$\$\$\n";
  }

  if ($NewTextFileRef) {
    my @LineWords = ();

    if (defined $ReferenceCmpdID) {
      push @LineWords, $ReferenceCmpdID;
    }
    push @LineWords, ($DatabaseCmpdID, $ComparisonValue);

    if ($FingerprintsFilesInfo{Database}{OutputDataFields} || $FingerprintsFilesInfo{Database}{OutputDataCols}) {
      push @LineWords, RetrieveDatabaseDataForTextOutputFile($DatabaseCmpdID);
    }
    print $NewTextFileRef JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}), "\n";
  }
}

# Open the SD and/or text output files requested in %OptionsInfo and write the column
# labels line to the text file.
#
# Returns a list of two file handles, (SD file handle, text file handle); the entry
# for an output type that was not requested is undef. Dies when a file can't be
# opened.
#
# Files are opened using three argument open with lexical file handles so that the
# file name is never interpreted as an open mode and no global bareword handles
# are used.
#
sub SetupAndOpenOutputFiles {
  my($NewSDFileRef, $NewTextFileRef) = (undef) x 2;

  if ($OptionsInfo{SDOutput}) {
    my $NewSDFile = $OutputFilesInfo{SDOutFileName};
    print "Generating SD file $NewSDFile...\n";
    open $NewSDFileRef, '>', $NewSDFile or die "Error: Couldn't open $NewSDFile: $! \n";
  }

  if ($OptionsInfo{TextOutput}) {
    my $NewTextFile = $OutputFilesInfo{TextOutFileName};
    print "Generating text file $NewTextFile...\n";
    open $NewTextFileRef, '>', $NewTextFile or die "Error: Couldn't open $NewTextFile: $! \n";

    WriteTextFileCoulmnLabels($NewTextFileRef);
  }

  return ($NewSDFileRef, $NewTextFileRef);
}

# Write out appropriate column labels to text file...
#
# The labels line starts with the reference compound ID (individual reference mode
# only), database compound ID and comparison value columns, followed by labels
# for any database data fields or data columns being written to the text file.
#
sub WriteTextFileCoulmnLabels {
  my($NewTextFileRef) = @_;
  my @LineWords = ();

  if ($SimilaritySearchInfo{IndividualReferenceMode}) {
    push @LineWords, qw(ReferenceCompoundID DatabaseCompoundID ComparisonValue);
  }
  elsif ($SimilaritySearchInfo{MultipleReferencesMode}) {
    push @LineWords, qw(DatabaseCompoundID ComparisonValue);
  }

  # Add columns for other database fingerprints file data to be written to output file...
  if ($FingerprintsFilesInfo{Database}{OutputDataFields}) {
    push @LineWords, @{$FingerprintsFilesInfo{Database}{DataFieldsToOutput}};
  }
  elsif ($FingerprintsFilesInfo{Database}{OutputDataCols}) {
    push @LineWords, @{$FingerprintsFilesInfo{Database}{DataColLabelsToOutput}};
  }

  my $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileRef "$Line\n";
}

# Write molecule string data to SD output file...
#
# When compound string data was collected from a database SD file, everything before
# the molfile terminator line is written out followed by a fresh terminator; otherwise,
# an empty connection table block is written.
#
# The molfile terminator in the CTfile format is "M  END" with two spaces. Any run
# of whitespace between "M" and "END" is accepted while looking for it and the
# standard two space form is always written out.
#
sub WriteMolStringDataToSDOutputFile {
  my($DatabaseCmpdID, $NewSDFileRef) = @_;

  if ($FingerprintsFilesInfo{Database}{CollectCmpdStringData}) {
    # Only the part before the first terminator is needed...
    my($MolString) = split /M\s+END/, $DatabaseFingerprintsFileData{$DatabaseCmpdID}, 2;
    print $NewSDFileRef "$MolString\nM  END\n";
  }
  else {
    # Just write out an empty molecule data string...
    print $NewSDFileRef SDFileUtil::GenerateEmptyCtabBlockLines(), "\n";
  }
}

# Write database data from SD or Text database file to SD output file...
#
# For SD database files, the data field labels and values are parsed from the
# collected compound string and either all fields present in the compound or the
# fields selected for output are written. For text database files, the selected
# data columns are written using the column labels as data field labels.
#
sub WriteDatabaseDataToSDOutputFile {
  my($DatabaseCmpdID, $NewSDFileRef) = @_;

  if ($FingerprintsFilesInfo{Database}{OutputDataFields}) {
    my @CmpdLines = split /\n/, $DatabaseFingerprintsFileData{$DatabaseCmpdID};
    my %DataFieldLabelAndValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    my @DataFieldLabels = $FingerprintsFilesInfo{Database}{OutputCurrentDataFields} ? GetCmpdDataHeaderLabels(\@CmpdLines) : @{$FingerprintsFilesInfo{Database}{DataFieldsToOutput}};

    for my $DataFieldLabel (@DataFieldLabels) {
      # Fields missing from this compound are written with empty values...
      my $DataFieldValue = exists $DataFieldLabelAndValues{$DataFieldLabel} ? $DataFieldLabelAndValues{$DataFieldLabel} : '';
      print $NewSDFileRef "> <$DataFieldLabel>\n$DataFieldValue\n\n";
    }
  }
  elsif ($FingerprintsFilesInfo{Database}{OutputDataCols}) {
    for my $DataColNum (@{$FingerprintsFilesInfo{Database}{DataColNumsToOutput}}) {
      my $DataFieldLabel = $FingerprintsFilesInfo{Database}{DataColNumToLabelMap}{$DataColNum};
      my $DataFieldValue = $DatabaseFingerprintsFileData{$DatabaseCmpdID}->[$DataColNum];
      print $NewSDFileRef "> <$DataFieldLabel>\n$DataFieldValue\n\n";
    }
  }
}

# Retrieve database data from SD or Text database file for text output file...
#
# Returns a list of values for the database compound ID: values of the data fields
# selected for output for SD database files, or values of the data columns selected
# for output for text database files. The values come back in the same order as
# the labels written to the column labels line; missing values are returned as
# empty strings.
#
# The data column values are looked up using the zero based column numbers stored
# in DataColNumsToOutput, which is what WriteDatabaseDataToSDOutputFile does as
# well, instead of the positions of those numbers in the list.
#
sub RetrieveDatabaseDataForTextOutputFile {
  my($DatabaseCmpdID) = @_;

  if ($FingerprintsFilesInfo{Database}{OutputDataFields}) {
    my @CmpdLines = split /\n/, $DatabaseFingerprintsFileData{$DatabaseCmpdID};
    my %DataFieldLabelAndValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    return map { exists $DataFieldLabelAndValues{$_} ? $DataFieldLabelAndValues{$_} : ''} @{$FingerprintsFilesInfo{Database}{DataFieldsToOutput}};
  }
  elsif ($FingerprintsFilesInfo{Database}{OutputDataCols}) {
    my @DataColNums = @{$FingerprintsFilesInfo{Database}{DataColNumsToOutput}};

    if (exists $DatabaseFingerprintsFileData{$DatabaseCmpdID}) {
      my $DataLineWordsRef = $DatabaseFingerprintsFileData{$DatabaseCmpdID};
      return map { defined $DataLineWordsRef->[$_] ? $DataLineWordsRef->[$_] : '' } @DataColNums;
    }

    # One empty value for each column to keep the output columns aligned...
    return ('') x scalar @DataColNums;
  }

  return ();
}

# Collect database file SD compound string or CSV/TSV data line for generating results
# files. Data already collected for a database compound ID is left as is...
#
sub CollectDatabaseFileData {
  my($DatabaseCmpdID, $DatabaseFingerprintsFileIO) = @_;

  if (exists $DatabaseFingerprintsFileData{$DatabaseCmpdID}) {
    return;
  }

  if ($FingerprintsFilesInfo{Database}{CollectCmpdStringData}) {
    $DatabaseFingerprintsFileData{$DatabaseCmpdID} = $DatabaseFingerprintsFileIO->GetCompoundString();
  }

  if ($FingerprintsFilesInfo{Database}{CollectDataLine}) {
    my @DataLineWords = $DatabaseFingerprintsFileIO->GetDataLineWords();
    $DatabaseFingerprintsFileData{$DatabaseCmpdID} = \@DataLineWords;
  }
}

# Read fingerprints data from reference fingerprints file and store references to
# the lists of reference compound IDs and fingerprints objects in %SimilaritySearchInfo...
#
sub ReadReferenceFingerprintsData {
  my $FingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{Reference}{FingerprintsFileIOParameters}});

  ($SimilaritySearchInfo{ReferenceCmpdIDsRef}, $SimilaritySearchInfo{ReferenceFingerprintsObjectsRef}) = Fingerprints::FingerprintsFileUtil::ReadAndProcessFingerpritsData($FingerprintsFileIO);
}

# Retrieve information about fingerprints files...
#
# Resets the fingerprints files, output files and similarity search information
# hashes, records the reference and database file names, and calls the routines
# which retrieve file, comparison and output information.
#
sub RetrieveFingerprintsFilesInfo {

  %FingerprintsFilesInfo = ();
  %OutputFilesInfo = ();
  %SimilaritySearchInfo = ();

  %{$FingerprintsFilesInfo{Reference}} = ();
  %{$FingerprintsFilesInfo{Database}} = ();

  # Set up reference and database file names...
  $FingerprintsFilesInfo{Reference}{FileName} = $FingerprintsFilesList[0];
  $FingerprintsFilesInfo{Database}{FileName} = $FingerprintsFilesList[1];

  # Retrieve information about reference and database fingerprints file...
  RetrieveReferenceFingerprintsFileInfo();
  RetrieveDatabaseFingerprintsFileInfo();

  # Setup fingerprints comparison method and associated method parameters...
  SetupReferenceAndDatabaseFingerprintsComparisonInfo();

  # Retrieve information for output files...
  RetrieveOutputFilesInfo();
}

# Setup reference and database fingerprints comparison method and associated method parameters...
#
# Using information already available in %FingerprintsFilesInfo and %OptionsInfo, the
# following keys are set in %SimilaritySearchInfo: IndividualReferenceMode,
# MultipleReferencesMode, ComparisonMethod, ComparisonMethodParameters, ComparisonCutoff,
# KeepTop, ApplyComparisonCutoff, GroupFusionMethodRef, ApplyPrecisionDuringFusion,
# UsekNN and SortComparisonValues.
#
# Dies when reference and database fingerprints types don't match or when the
# search mode, fingerprints string type or group fusion rule is not supported.
# A mismatch in fingerprints string descriptions only generates a warning.
#
# Fingerprints type and description strings are compared as literal text; they
# are quoted before being used in a pattern so that any regular expression
# metacharacters in them can neither change the match nor break the pattern.
#
sub SetupReferenceAndDatabaseFingerprintsComparisonInfo {
  my $ReferenceInfoRef = $FingerprintsFilesInfo{Reference};
  my $DatabaseInfoRef = $FingerprintsFilesInfo{Database};

  # Make sure reference and database fingerprints string match...
  my $DatabaseType = $DatabaseInfoRef->{FirstFingerprintsStringType};
  if (($ReferenceInfoRef->{FirstFingerprintsStringType} !~ /^\Q$DatabaseType\E$/i) ||
      ($ReferenceInfoRef->{FingerprintsBitVectorStringMode} != $DatabaseInfoRef->{FingerprintsBitVectorStringMode}) ||
      ($ReferenceInfoRef->{FingerprintsVectorStringMode} != $DatabaseInfoRef->{FingerprintsVectorStringMode}) ) {
    die "Error: First reference fingerprints string type, $FingerprintsFilesInfo{Reference}{FirstFingerprintsStringType}, must match first database fingerprints type, $FingerprintsFilesInfo{Database}{FirstFingerprintsStringType}.\n";
  }

  my $DatabaseDescription = $DatabaseInfoRef->{FirstFingerprintsStringDescription};
  if ($ReferenceInfoRef->{FirstFingerprintsStringDescription} !~ /^\Q$DatabaseDescription\E$/i) {
    warn "Warning: First reference fingerprints string description, $FingerprintsFilesInfo{Reference}{FirstFingerprintsStringDescription}, doesn't match first database fingerprints string description, $FingerprintsFilesInfo{Database}{FirstFingerprintsStringDescription}.\n";
  }

  # Setup individual reference and multiple references search mode...
  $SimilaritySearchInfo{IndividualReferenceMode} = undef;
  $SimilaritySearchInfo{MultipleReferencesMode} = undef;

  if ($OptionsInfo{Mode} =~ /^IndividualReference$/i) {
    $SimilaritySearchInfo{IndividualReferenceMode} = 1;
  }
  elsif ($OptionsInfo{Mode} =~ /^MultipleReferences$/i) {
    $SimilaritySearchInfo{MultipleReferencesMode} = 1;
  }
  else {
    die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: IndividualReference, MultipleReferences\n";
  }

  # Set up reference and database fingerprints similarity search method and parameters...
  my($ComparisonMeasure, $ComparisonMethod) = ('', '');
  my @ComparisonMethodParameters = ();

  if ($ReferenceInfoRef->{FingerprintsBitVectorStringMode}) {
    $ComparisonMeasure = $OptionsInfo{SpecifiedBitVectorComparisonMeasure};
    $ComparisonMethod = $OptionsInfo{SpecifiedBitVectorComparisonMeasureMethod};

    # Only these bit-vector measures take additional alpha and/or beta parameters...
    if ($ComparisonMeasure =~ /^TverskySimilarity$/i) {
      push @ComparisonMethodParameters, $OptionsInfo{Alpha};
    }
    elsif ($ComparisonMeasure =~ /^WeightedTverskySimilarity$/i) {
      push @ComparisonMethodParameters, $OptionsInfo{Alpha};
      push @ComparisonMethodParameters, $OptionsInfo{Beta};
    }
    elsif ($ComparisonMeasure =~ /^WeightedTanimotoSimilarity$/i) {
      push @ComparisonMethodParameters, $OptionsInfo{Beta};
    }
  }
  elsif ($ReferenceInfoRef->{FingerprintsVectorStringMode}) {
    $ComparisonMeasure = $OptionsInfo{SpecifiedVectorComparisonMeasure};
    $ComparisonMethod = $OptionsInfo{SpecifiedVectorComparisonMeasuresMethod};

    # Vector comparison methods get the comparison mode and a flag to skip values check...
    push @ComparisonMethodParameters, $OptionsInfo{SpecifiedVectorComparisonMode};
    push @ComparisonMethodParameters, ($OptionsInfo{Fast} ? 1 : 0);
  }
  else {
    die "Error: Unknown fingerprints string type. Supported values: FingerprintsBitVectorString or FingerprintsVectorString.\n";
  }

  # The cutoff is always applied in individual reference mode; it's optional in
  # multiple references mode...
  my $ApplyComparisonCutoff = $SimilaritySearchInfo{IndividualReferenceMode} ? 1 : (($SimilaritySearchInfo{MultipleReferencesMode} && $OptionsInfo{GroupFusionApplyCutoff}) ? 1 : 0);

  # Similarity search keeps the largest values for similarity methods and the
  # smallest values for distance methods; any other search mode keeps the opposite...
  my $SimilaritySearchMode = ($OptionsInfo{SearchMode} =~ /^SimilaritySearch$/i) ? 1 : 0;
  my($ComparisonCutoff, $KeepTop);
  if ($ComparisonMethod =~ /Distance/i) {
    $ComparisonCutoff = $OptionsInfo{DistanceCutoff};
    $KeepTop = $SimilaritySearchMode ? 0 : 1;
  }
  else {
    $ComparisonCutoff = $OptionsInfo{SimilarityCutoff};
    $KeepTop = $SimilaritySearchMode ? 1 : 0;
  }

  $SimilaritySearchInfo{ComparisonMethod} = $ComparisonMethod;
  @{$SimilaritySearchInfo{ComparisonMethodParameters}} = @ComparisonMethodParameters;

  $SimilaritySearchInfo{ComparisonCutoff} = $ComparisonCutoff;
  $SimilaritySearchInfo{KeepTop} = $KeepTop;
  $SimilaritySearchInfo{ApplyComparisonCutoff} = $ApplyComparisonCutoff;

  # Setup references to group fusion methods...
  $SimilaritySearchInfo{GroupFusionMethodRef} = undef;
  $SimilaritySearchInfo{ApplyPrecisionDuringFusion} = undef;

  # Each rule: name pattern, fusion method, and whether precision is applied to
  # the fused value. Max and Min rely on the comparison values being sorted using
  # the value of KeepTop: Max is the first value and Min is the last value in the
  # sorted list...
  my @GroupFusionRules = (
    [qr/^Max$/i, sub { my($ComparisonValuesRef) = @_; return $ComparisonValuesRef->[0]; }, undef],
    [qr/^Min$/i, sub { my($ComparisonValuesRef) = @_; return $ComparisonValuesRef->[$#{$ComparisonValuesRef}]; }, undef],
    [qr/^Mean$/i, \&StatisticsUtil::Mean, 1],
    [qr/^Median$/i, \&StatisticsUtil::Median, 1],
    [qr/^Sum$/i, \&StatisticsUtil::Sum, 1],
    [qr/^Euclidean$/i, \&StatisticsUtil::Euclidean, 1],
  );

  FUSIONRULE: for my $GroupFusionRuleRef (@GroupFusionRules) {
    my($RulePattern, $FusionMethodRef, $ApplyPrecision) = @{$GroupFusionRuleRef};
    if ($OptionsInfo{GroupFusionRule} =~ $RulePattern) {
      $SimilaritySearchInfo{GroupFusionMethodRef} = $FusionMethodRef;
      $SimilaritySearchInfo{ApplyPrecisionDuringFusion} = $ApplyPrecision;
      last FUSIONRULE;
    }
  }
  if (!defined $SimilaritySearchInfo{GroupFusionMethodRef}) {
    die "Error: The value specified, $Options{groupfusionrule}, for option \"-g, --GroupFusionRule\" is not valid. Allowed values: Max, Min, Mean, Median, Sum, Euclidean\n";
  }

  $SimilaritySearchInfo{UsekNN} = ($OptionsInfo{kNN} !~ /^All$/i) ? 1 : 0;
  $SimilaritySearchInfo{SortComparisonValues} = (($OptionsInfo{GroupFusionRule} =~ /^(Max|Min)$/i) || $SimilaritySearchInfo{UsekNN}) ? 1 : 0;
}

# Retrieve information about reference fingerprints file...
#
# Stores the following for the reference fingerprints file in %FingerprintsFilesInfo:
# file type and input delimiter, parameters for creating a FingerprintsFileIO
# object, and fingerprints string mode, type and description information.
#
sub RetrieveReferenceFingerprintsFileInfo {
  my $ReferenceFile = $FingerprintsFilesInfo{Reference}{FileName};

  # File type and delimiter are stored before any other information is retrieved...
  my($FileType, $InDelim) = RetrieveFingerprintsFileInfo($ReferenceFile);
  @{$FingerprintsFilesInfo{Reference}}{qw(FileType InDelim)} = ($FileType, $InDelim);

  # Setup reference FingerprintsFileIO parameters...
  %{$FingerprintsFilesInfo{Reference}{FingerprintsFileIOParameters}} = RetrieveFingerprintsFileIOParameters('Reference', $FileType, $ReferenceFile);

  # Make sure reference fingerprints data file contains valid data and retrieve
  # fingerprints string mode information...
  my($StringMode, $BitVectorStringMode, $VectorStringMode, $FirstStringType, $FirstStringDescription) = RetrieveFingerprintsFileFingerprintsStringInfo('Reference', $ReferenceFile);

  @{$FingerprintsFilesInfo{Reference}}{qw(FingerprintsStringMode FingerprintsBitVectorStringMode FingerprintsVectorStringMode FirstFingerprintsStringType FirstFingerprintsStringDescription)} = ($StringMode, $BitVectorStringMode, $VectorStringMode, $FirstStringType, $FirstStringDescription);
}

# Retrieve information about database fingerprints file...
#
# Stores the following for the database fingerprints file in %FingerprintsFilesInfo:
# file type and input delimiter, parameters for creating a FingerprintsFileIO
# object, fingerprints string mode, type and description information, data fields
# and data columns to output, and flags indicating whether compound strings or
# data lines need to be collected during the search. The maximum number of similar
# molecules to retrieve is set up at the end.
#
sub RetrieveDatabaseFingerprintsFileInfo {
  my $FingerprintsFile = $FingerprintsFilesInfo{Database}{FileName};
  my($FileType, $InDelim) = RetrieveFingerprintsFileInfo($FingerprintsFile);

  $FingerprintsFilesInfo{Database}{FileType} = $FileType;
  $FingerprintsFilesInfo{Database}{InDelim} = $InDelim;

  # Setup database FingerprintsFileIO parameters...
  %{$FingerprintsFilesInfo{Database}{FingerprintsFileIOParameters}} = RetrieveFingerprintsFileIOParameters('Database', $FileType, $FingerprintsFile);

  # Make sure database fingerprints data file contains valid data and retrieve
  # fingerprints string mode information...
  my($FingerprintsStringMode, $FingerprintsBitVectorStringMode, $FingerprintsVectorStringMode, $FirstFingerprintsStringType, $FirstFingerprintsStringDescription) = RetrieveFingerprintsFileFingerprintsStringInfo('Database', $FingerprintsFile);
  $FingerprintsFilesInfo{Database}{FingerprintsStringMode} = $FingerprintsStringMode;
  $FingerprintsFilesInfo{Database}{FingerprintsBitVectorStringMode} = $FingerprintsBitVectorStringMode;
  $FingerprintsFilesInfo{Database}{FingerprintsVectorStringMode} = $FingerprintsVectorStringMode;
  $FingerprintsFilesInfo{Database}{FirstFingerprintsStringType} = $FirstFingerprintsStringType;
  $FingerprintsFilesInfo{Database}{FirstFingerprintsStringDescription} = $FirstFingerprintsStringDescription;

  # Retrieve database fingerprints data field information for output file...
  #
  RetrieveDatabaseFingerprintsDataFieldsInfo($FingerprintsFile, $FileType, $InDelim);

  # Retrieve database fingerprints text file data columns information for output file...
  #
  RetrieveDatabaseFingerprintsDataColsInfo($FingerprintsFile, $FileType, $InDelim);

  # Any need to collect database compound string or data line for generation of results files...
  $FingerprintsFilesInfo{Database}{CollectCmpdStringData} = ($FileType =~ /^SD$/i) ? 1 : 0;
  $FingerprintsFilesInfo{Database}{CollectDataLine} = ($FileType =~ /^Text$/i && $OptionsInfo{DatabaseDataColsMode} =~ /^(All|Specify)$/i) ? 1 : 0;
  $FingerprintsFilesInfo{Database}{CollectInputFileData} = ($FingerprintsFilesInfo{Database}{CollectCmpdStringData} || $FingerprintsFilesInfo{Database}{CollectDataLine}) ? 1 : 0;

  # Set maximum number of similar compounds to find for individual reference of set of multiple
  # reference compounds...
  #
  SetMaximumSimilarMoleculesToRetrieve($FingerprintsFile, $FileType, $InDelim);
}

# Retrieve database fingerprints data field information...
#
# Sets up the data fields to output for a database SD file in %FingerprintsFilesInfo:
# OutputDataFields, DataFieldsToOutput, OutputCurrentDataFields, AllDataFields,
# CommonDataFields and SpecifiedDatabaseDataFields. Nothing beyond initialization
# is done for other file types.
#
# The SD file is scanned for data field labels only when those are needed in
# advance; it's opened using three argument open with a lexical file handle so
# that the file name is never interpreted as an open mode. Dies when the file
# can't be opened.
#
sub RetrieveDatabaseFingerprintsDataFieldsInfo {
  my($FingerprintsFile, $FileType, $InDelim) = @_;

  $FingerprintsFilesInfo{Database}{OutputDataFields} = 0;
  @{$FingerprintsFilesInfo{Database}{DataFieldsToOutput}} = ();

  $FingerprintsFilesInfo{Database}{OutputCurrentDataFields} = 0;

  @{$FingerprintsFilesInfo{Database}{AllDataFields}} = ();
  @{$FingerprintsFilesInfo{Database}{CommonDataFields}} = ();
  @{$FingerprintsFilesInfo{Database}{SpecifiedDatabaseDataFields}} = ();

  if ($FileType !~ /^SD$/i) {
    return;
  }

  # No need to go over SD file and collect data fields for SD file during All DatabaseDataFieldsMode as
  # they would be retrieved from database SD file compound string during generation of output files...
  #
  my $CollectDataFields = (($OptionsInfo{TextOutput} && $OptionsInfo{DatabaseDataFieldsMode} =~ /^(All|Common)$/i) || ($OptionsInfo{SDOutput} && $OptionsInfo{DatabaseDataFieldsMode} =~ /^Common$/i)) ? 1 : 0;

  my($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = (undef) x 3;

  if ($CollectDataFields) {
    open my $SDFileRef, '<', $FingerprintsFile or die "Error: Couldn't open $FingerprintsFile: $! \n";
    ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileRef);
    close $SDFileRef;
  }

  my @DataFieldsToOutput = ();
  if ($OptionsInfo{DatabaseDataFieldsMode} =~ /^All$/i) {
    if (defined $AllDataFieldsRef) {
      push @DataFieldsToOutput, @{$AllDataFieldsRef};
      push @{$FingerprintsFilesInfo{Database}{AllDataFields}}, @{$AllDataFieldsRef};
    }
    else {
      # Retrieve and output data fields and values dynamically...
      $FingerprintsFilesInfo{Database}{OutputCurrentDataFields} = 1;
    }
  }
  elsif ($OptionsInfo{DatabaseDataFieldsMode} =~ /^Common$/i) {
    if (defined $CommonDataFieldsRef) {
      push @DataFieldsToOutput, @{$CommonDataFieldsRef};
      push @{$FingerprintsFilesInfo{Database}{CommonDataFields}}, @{$CommonDataFieldsRef};
    }
  }
  elsif ($OptionsInfo{DatabaseDataFieldsMode} =~ /^Specify$/i) {
    push @DataFieldsToOutput, @{$OptionsInfo{SpecifiedDatabaseDataFields}};
    push @{$FingerprintsFilesInfo{Database}{SpecifiedDatabaseDataFields}}, @{$OptionsInfo{SpecifiedDatabaseDataFields}};
  }

  # Data fields are written out for every mode other than CompoundID...
  if ($OptionsInfo{DatabaseDataFieldsMode} !~ /^CompoundID$/i) {
    $FingerprintsFilesInfo{Database}{OutputDataFields} = 1;
  }

  push @{$FingerprintsFilesInfo{Database}{DataFieldsToOutput}}, @DataFieldsToOutput;
}

# Retrieve database fingerprints data columns information...
#
# Resets all data column related settings for the database file in %FingerprintsFilesInfo
# and, for text files only, reads the column label line and sets up the zero-based column
# numbers and the matching labels to write out according to DatabaseDataColsMode.
#
# Parameters:
#   $FingerprintsFile - Name of the database fingerprints file.
#   $FileType         - Type of the file; anything other than Text leaves the settings empty.
#   $InDelim          - Input delimiter name: Tab, semicolon or anything else for comma.
#
# Dies when the file can't be opened or when a column specified using "--DatabaseDataCols"
# doesn't exist in the file.
#
sub RetrieveDatabaseFingerprintsDataColsInfo {
  my($FingerprintsFile, $FileType, $InDelim) = @_;
  my($Line, $ColNum, $ColLabel, $NumOfCols, @DataColLabels, @DataColLabelsToOutput, @DataColNumsToOutput, %DataColLabelToNumMap, %DataColNumToLabelMap);

  $FingerprintsFilesInfo{Database}{OutputDataCols} = 0;

  @{$FingerprintsFilesInfo{Database}{DataColLabels}} = ();
  %{$FingerprintsFilesInfo{Database}{DataColLabelToNumMap}} = ();
  %{$FingerprintsFilesInfo{Database}{DataColNumToLabelMap}} = ();

  @{$FingerprintsFilesInfo{Database}{DataColNumsToOutput}} = ();
  @{$FingerprintsFilesInfo{Database}{DataColLabelsToOutput}} = ();

  # Data columns are only available in text files...
  if ($FileType !~ /^Text$/i) {
    return;
  }

  @DataColLabels = ();
  @DataColLabelsToOutput = ();
  @DataColNumsToOutput = ();

  %DataColLabelToNumMap = ();
  %DataColNumToLabelMap = ();

  # Get column label line. Three argument open is used as the file name comes from the
  # command line and must never be interpreted as an open mode or a pipe...
  open TEXTFILE, '<', $FingerprintsFile or die "Error: Couldn't open $FingerprintsFile: $! \n";
  $Line = TextUtil::GetTextLine(\*TEXTFILE);
  close TEXTFILE;

  # Map delimiter name to the actual delimiter character...
  $InDelim = ($InDelim =~ /^Tab$/i) ? "\t" : ($InDelim =~ /semicolon/i ? "\;" : "\,");

  @DataColLabels = TextUtil::SplitWords($Line, $InDelim);
  $NumOfCols = scalar @DataColLabels;

  # Setup maps between column labels and zero-based column numbers...
  for $ColNum (0 .. $#DataColLabels) {
    $ColLabel = $DataColLabels[$ColNum];
    $DataColLabelToNumMap{$ColLabel} = $ColNum;
    $DataColNumToLabelMap{$ColNum} = $ColLabel;
  }

  if ($OptionsInfo{DatabaseDataColsMode} =~ /^Specify$/i) {
    if ($OptionsInfo{DatabaseColMode} =~ /^ColNum$/i) {
      # Column numbers specified on the command line start from 1...
      for $ColNum (@{$OptionsInfo{SpecifiedDatabaseDataCols}}) {
        if ($ColNum > $NumOfCols) {
          die "Error: Column number, $ColNum, specified using \"--DatabaseDataCols\" is not valid: It must be <= $NumOfCols\n";
        }
        push @DataColNumsToOutput, ($ColNum - 1);
      }
    }
    elsif ($OptionsInfo{DatabaseColMode} =~ /^ColLabel$/i) {
      for $ColLabel (@{$OptionsInfo{SpecifiedDatabaseDataCols}}) {
        if (!exists $DataColLabelToNumMap{$ColLabel}) {
          die "Error: Column label, $ColLabel, specified using \"--DatabaseDataCols\" is not valid: It doesn't exist\n";
        }
        push @DataColNumsToOutput, $DataColLabelToNumMap{$ColLabel};
      }
    }
  }
  elsif ($OptionsInfo{DatabaseDataColsMode} =~ /^All$/i) {
    @DataColNumsToOutput = (0 .. $#DataColLabels);
  }

  # Setup data column labels to output by looking up the label of each selected column
  # number; the positions in the list of selected columns are not column numbers...
  @DataColLabelsToOutput = map { $DataColNumToLabelMap{$_} } @DataColNumsToOutput;

  $FingerprintsFilesInfo{Database}{OutputDataCols} = scalar @DataColNumsToOutput ? 1 : 0;

  @{$FingerprintsFilesInfo{Database}{DataColLabels}} = @DataColLabels;
  %{$FingerprintsFilesInfo{Database}{DataColLabelToNumMap}} = %DataColLabelToNumMap;
  %{$FingerprintsFilesInfo{Database}{DataColNumToLabelMap}} = %DataColNumToLabelMap;

  @{$FingerprintsFilesInfo{Database}{DataColNumsToOutput}} = @DataColNumsToOutput;
  @{$FingerprintsFilesInfo{Database}{DataColLabelsToOutput}} = @DataColLabelsToOutput;
}

# Set maximum number of similar compounds to find for individual reference or set of multiple
# reference compounds...
#
# Does nothing unless SimilarCountMode is PercentSimilar. Otherwise, the number of
# database molecules is counted, or reused for SD files when it's already available in
# %FingerprintsFilesInfo, and $OptionsInfo{MaxSimilarMolecules} is set to the specified
# percentage of that count with a minimum value of 1.
#
# Parameters:
#   $FingerprintsFile - Name of the database fingerprints file.
#   $FileType         - Type of the file: SD, Text or FP.
#   $InDelim          - Not used here; accepted to keep the call signature uniform.
#
sub SetMaximumSimilarMoleculesToRetrieve {
  my($FingerprintsFile, $FileType, $InDelim) = @_;
  my($MaxSimilarMolecules, $NumOfDatabaseMolecules, $PercentSimilarMolecules, $Line);

  if ($OptionsInfo{SimilarCountMode} !~ /^PercentSimilar$/i) {
    return;
  }

  $PercentSimilarMolecules = $OptionsInfo{PercentSimilarMolecules};

  # Count database entries to figure out MaxSimilarMolecules using PercentSimilarMolecules
  # value...
  $NumOfDatabaseMolecules = 0;
  if ($FileType =~ /^SD$/i && exists($FingerprintsFilesInfo{Database}{NumOfDatabaseMolecules})) {
    # It might already be counted for SD file...
    $NumOfDatabaseMolecules = $FingerprintsFilesInfo{Database}{NumOfDatabaseMolecules};
  }
  else {
    print "Calculating maximum number of similar molecules to retrieve for \"PercentSimilar\" value of \"--SimilarCountMode\" option by counting number of molecules in database fingerprints file...\n";

    # Three argument open is used as the file name comes from the command line and must
    # never be interpreted as an open mode or a pipe...
    open FINGERPRINTSFILE, '<', $FingerprintsFile or die "Error: Couldn't open $FingerprintsFile: $! \n";

    # NOTE(review): The loops below stop at the first line which evaluates to false;
    # confirm TextUtil::GetTextLine never returns such a line before the end of file.
    #
    if ($FileType =~ /^SD$/i) {
      # Each compound record ends with a $$$$ line...
      while ($Line = TextUtil::GetTextLine(\*FINGERPRINTSFILE)) {
        if ($Line =~ /^\$\$\$\$/) {
          $NumOfDatabaseMolecules++;
        }
      }
    }
    elsif ($FileType =~ /^Text$/i) {
      # Ignore column label line...
      $Line = TextUtil::GetTextLine(\*FINGERPRINTSFILE);
      while ($Line = TextUtil::GetTextLine(\*FINGERPRINTSFILE)) {
        $NumOfDatabaseMolecules++;
      }
    }
    elsif ($FileType =~ /^FP$/i) {
      # Count all lines other than those starting with #...
      while ($Line = TextUtil::GetTextLine(\*FINGERPRINTSFILE)) {
        if ($Line !~ /^#/) {
          $NumOfDatabaseMolecules++;
        }
      }
    }
    close FINGERPRINTSFILE;

    # Any other file type leaves the count at zero...
    $FingerprintsFilesInfo{Database}{NumOfDatabaseMolecules} = $NumOfDatabaseMolecules;
  }

  # Always retrieve at least one molecule...
  $MaxSimilarMolecules = int (($NumOfDatabaseMolecules * $PercentSimilarMolecules)/100);
  if ($MaxSimilarMolecules < 1) {
    $MaxSimilarMolecules = 1;
  }

  $OptionsInfo{MaxSimilarMolecules} = $MaxSimilarMolecules;
}

# Retrieve information about fingerprints file...
#
# Returns a list containing the fingerprints file type and the name of the input
# delimiter. The delimiter name is an empty string for non-text files, Tab for text
# files with tsv extension and the value of InDelim option for other text files.
#
# Dies when the file doesn't exist or no fingerprints file type is detected for it.
#
sub RetrieveFingerprintsFileInfo {
  my($FingerprintsFile) = @_;
  my($FileType, $InDelim, $FileDir, $FileExt, $FileName);

  if (!(-e $FingerprintsFile)) {
    die "Error: Input fingerprints file, $FingerprintsFile, doesn't exist.\n";
  }

  $FileType = Fingerprints::FingerprintsFileUtil::GetFingerprintsFileType($FingerprintsFile);
  if (IsEmpty($FileType)) {
    die "Error: Input file, $FingerprintsFile, is not a fingerprints file.\n";
  }

  $InDelim = '';
  if ($FileType =~ /^Text$/i) {
    ($FileDir, $FileName, $FileExt) = ParseFileName($FingerprintsFile);
    $InDelim = ($FileExt =~ /^tsv$/i) ? 'Tab' : $OptionsInfo{InDelim};
  }

  return ($FileType, $InDelim);
}

# Retrieve fingerprints file IO parameters...
#
# Returns a hash, as a list of name and value pairs, containing the parameters for
# reading a fingerprints file of the specified type.
#
# Parameters:
#   $FingerprintsFileMode - Reference or Database; it selects which set of options and
#                           which entry of %FingerprintsFilesInfo is used.
#   $FileType             - Type of the file: SD, FP or Text.
#   $FingerprintsFile     - Name of the fingerprints file.
#
# Dies for any other file mode or file type.
#
sub RetrieveFingerprintsFileIOParameters {
  my($FingerprintsFileMode, $FileType, $FingerprintsFile) = @_;
  my(%FingerprintsFileIOParams);

  if ($FingerprintsFileMode !~ /^(Reference|Database)$/) {
    die "Error: Unknown fingerprints file mode: $FingerprintsFileMode. Supported values: Reference or Database\n";
  }

  # Parameters shared by all file types...
  %FingerprintsFileIOParams = ('Name' => $FingerprintsFile, 'Mode' => 'Read', 'FingerprintsStringMode' => $OptionsInfo{FingerprintsMode}, 'ValidateData' => $OptionsInfo{ValidateData}, 'DetailLevel' => $OptionsInfo{Detail});

  if ($FileType =~ /^SD$/i) {
    # Data field and compound ID parameters for SD files...
    $FingerprintsFileIOParams{FingerprintsFieldLabel} = $OptionsInfo{"${FingerprintsFileMode}FingerprintsField"};
    $FingerprintsFileIOParams{CompoundIDMode} = $OptionsInfo{"${FingerprintsFileMode}CompoundIDMode"};
    $FingerprintsFileIOParams{CompoundIDFieldLabel} = $OptionsInfo{"${FingerprintsFileMode}CompoundIDField"};
    $FingerprintsFileIOParams{CompoundIDPrefix} = $OptionsInfo{"${FingerprintsFileMode}CompoundIDPrefix"};
  }
  elsif ($FileType =~ /^FP$/i) {
    # No additional parameters for FP files...
  }
  elsif ($FileType =~ /^Text$/i) {
    # Column and delimiter parameters for text files...
    $FingerprintsFileIOParams{FingerprintsCol} = $OptionsInfo{"${FingerprintsFileMode}FingerprintsCol"};
    $FingerprintsFileIOParams{ColMode} = $OptionsInfo{"${FingerprintsFileMode}ColMode"};
    $FingerprintsFileIOParams{CompoundIDCol} = $OptionsInfo{"${FingerprintsFileMode}CompoundIDCol"};
    $FingerprintsFileIOParams{CompoundIDPrefix} = $OptionsInfo{"${FingerprintsFileMode}CompoundIDPrefix"};
    $FingerprintsFileIOParams{InDelim} = $FingerprintsFilesInfo{$FingerprintsFileMode}{InDelim};
  }
  else {
    die "Error: Fingerprints file type, $FileType, is not valid. Supported file types: SD, FP or Text\n";
  }

  return %FingerprintsFileIOParams;
}

# Make sure fingerprints data file contains valid data and retrieve fingerprints string mode information...
#
# Opens the fingerprints file using the IO parameters stored in %FingerprintsFilesInfo
# for the specified file mode, checks its data and returns a list containing:
# fingerprints string mode, bit vector string mode, vector string mode, type of the
# first fingerprints string and description of the first fingerprints string.
#
# Parameters:
#   $FingerprintsFileMode - Key of the %FingerprintsFilesInfo entry to use; it also
#                           names the file in error messages.
#   $FingerprintsFile     - Name of the fingerprints file; used in error messages only.
#
# Dies when the file IO object can't be created or the file data is not valid.
#
sub RetrieveFingerprintsFileFingerprintsStringInfo {
  my($FingerprintsFileMode, $FingerprintsFile) = @_;
  my($FingerprintsFileIO, $FingerprintsStringMode, $FingerprintsBitVectorStringMode, $FingerprintsVectorStringMode, $FirstFingerprintsStringType, $FirstFingerprintsStringDescription);

  $FingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{$FingerprintsFileMode}{FingerprintsFileIOParameters}});

  # Name the file mode actually being processed instead of always blaming the reference file...
  if (!$FingerprintsFileIO || !$FingerprintsFileIO->IsFingerprintsFileDataValid()) {
    die "Error: $FingerprintsFileMode fingerprints file, $FingerprintsFile, contains invalid fingerprints data.\n";
  }

  $FingerprintsStringMode = $FingerprintsFileIO->GetFingerprintsStringMode();
  $FingerprintsBitVectorStringMode = $FingerprintsFileIO->GetFingerprintsBitVectorStringMode();
  $FingerprintsVectorStringMode = $FingerprintsFileIO->GetFingerprintsVectorStringMode();

  $FirstFingerprintsStringType = $FingerprintsFileIO->GetFirstFingerprintsStringType();
  $FirstFingerprintsStringDescription = $FingerprintsFileIO->GetFirstFingerprintsStringDescription();

  $FingerprintsFileIO->Close();

  return ($FingerprintsStringMode, $FingerprintsBitVectorStringMode, $FingerprintsVectorStringMode, $FirstFingerprintsStringType, $FirstFingerprintsStringDescription);
}

# Retrieve output files names using reference fingerprints file name...
#
# Sets up OutFileRoot, SDOutFileName and TextOutFileName in %OutputFilesInfo. The root
# of the output file names comes from OutFileRoot option, with any file extension taken
# off, or from the reference file name followed by SimilaritySearching.
#
# Dies when a requested output file has the same name as the reference or database
# input file, or when it already exists and overwriting is not requested.
#
sub RetrieveOutputFilesInfo {
  my($FileDir, $FileExt, $FileName, $OutFileRoot, $OutFileName, $SDOutFileName, $TextOutFileName, $SDOutFileExt, $TextOutFileExt, $ReferenceFileName, $DatabaseFileName, @OutFileNames);

  $OutputFilesInfo{OutFileRoot} = '';
  $OutputFilesInfo{SDOutFileName} = '';
  $OutputFilesInfo{TextOutFileName} = '';

  $ReferenceFileName = $FingerprintsFilesInfo{Reference}{FileName};
  $DatabaseFileName = $FingerprintsFilesInfo{Database}{FileName};

  $FileDir = ""; $FileName = ""; $FileExt = "";
  ($FileDir, $FileName, $FileExt) = ParseFileName($ReferenceFileName);

  $SDOutFileExt = "sdf";
  $TextOutFileExt = ($Options{outdelim} =~ /^tab$/i) ? "tsv" : "csv";

  if ($OptionsInfo{OutFileRoot}) {
    # Drop the file extension from the specified root, if there is one...
    my($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
    $OutFileRoot = ($RootFileName && $RootFileExt) ? $RootFileName : $OptionsInfo{OutFileRoot};
  }
  else {
    $OutFileRoot = "${FileName}SimilaritySearching";
  }

  $SDOutFileName = "${OutFileRoot}.${SDOutFileExt}";
  $TextOutFileName = "${OutFileRoot}.${TextOutFileExt}";

  # Only the requested output files need to be checked...
  @OutFileNames = ();
  if ($OptionsInfo{SDOutput}) {
    push @OutFileNames, $SDOutFileName;
  }
  if ($OptionsInfo{TextOutput}) {
    push @OutFileNames, $TextOutFileName;
  }

  # Input file names are quoted using \Q...\E to compare them literally: characters
  # such as . ( + in a file name must not be treated as regular expression syntax...
  for $OutFileName (@OutFileNames) {
    if ($OutFileName =~ /^\Q$ReferenceFileName\E$/i) {
      die "Error: Same output, $OutFileName, and reference input file names.\nSpecify a different name using \"-r --root\" option or use default name.\n";
    }
    if ($OutFileName =~ /^\Q$DatabaseFileName\E$/i) {
      die "Error: Same output, $OutFileName, and database input file names.\nSpecify a different name using \"-r --root\" option or use default name.\n";
    }
  }

  if (!$OptionsInfo{OverwriteFiles}) {
    for $OutFileName (@OutFileNames) {
      if (-e $OutFileName) {
        die "Error: The output file $OutFileName already exists.\n";
      }
    }
  }

  $OutputFilesInfo{OutFileRoot} = $OutFileRoot;
  $OutputFilesInfo{SDOutFileName} = $SDOutFileName;
  $OutputFilesInfo{TextOutFileName} = $TextOutFileName;

}

# Process input fingerprints file names...
#
# Sets @FingerprintsFilesList to the reference fingerprints file name followed by the
# database fingerprints file name. Dies with the usage message unless exactly two
# command line arguments are left.
#
sub ProcessFingerprintsFileNames {
  @FingerprintsFilesList = ();

  if (@ARGV != 2) {
    die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  }

  # Reference fingerprints file name followed by database fingerprints file name...
  push @FingerprintsFilesList, @ARGV[0, 1];

}

# Process option values...
#
# Validates the values in %Options, as far as it's not already done, and sets up
# %OptionsInfo from scratch. Dies on the first invalid value.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};
  $OptionsInfo{FingerprintsMode} = $Options{fingerprintsmode};

  $OptionsInfo{SearchMode} = $Options{searchmode};

  ProcessBitVectorComparisonOptions();
  ProcessVectorComparisonOptions();

  $OptionsInfo{GroupFusionRule} = $Options{groupfusionrule};
  $OptionsInfo{GroupFusionApplyCutoff} = ($Options{groupfusionapplycutoff} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{SimilarCountMode} = $Options{similarcountmode};
  $OptionsInfo{NumOfSimilarMolecules} = $Options{numofsimilarmolecules};
  $OptionsInfo{PercentSimilarMolecules} = $Options{percentsimilarmolecules};

  # Set MaxSimilarMolecules to NumOfSimilarMolecules. For PercentSimilar value of SimilarCountMode,
  # it'll be overwritten using number of entries in database fingerprints file and value of PercentSimilarMolecules...
  #
  $OptionsInfo{MaxSimilarMolecules} = $OptionsInfo{NumOfSimilarMolecules};

  $OptionsInfo{SimilarityCutoff} = $Options{similaritycutoff};
  $OptionsInfo{DistanceCutoff} = $Options{distancecutoff};

  # kNN is either All or a positive integer...
  $OptionsInfo{kNN} = $Options{knn};
  if ($Options{knn} !~ /^All$/i && !IsPositiveInteger($Options{knn})) {
    die "Error: The value specified, $Options{knn}, for option \"-k, --KNN\" is not valid. Allowed values: > 0 \n";
  }

  ProcessReferenceFingerprintsDataOptions();
  ProcessDatabaseFingerprintsDataOptions();

  $OptionsInfo{Detail} = $Options{detail};

  # Input delimiter is kept as a name; output delimiter is mapped to the actual character...
  $OptionsInfo{InDelim} = $Options{indelim};
  $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,");
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{Output} = $Options{output};
  $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|Both)$/i) ? 1 : 0;
  $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|Both)$/i) ? 1 : 0;

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;

  # Fast mode turns off validation of data...
  $OptionsInfo{Fast} = $Options{fast} ? 1 : 0;
  $OptionsInfo{ValidateData} = $Options{fast} ? 0 : 1;

  $OptionsInfo{Precision} = $Options{precision};
}

# Process options related to comparison of bit vector strings...
#
# Validates the value of BitVectorComparisonMode option against the similarity
# coefficients supported by Fingerprints::FingerprintsBitVector and sets up the
# specified measure, its name and its method name in %OptionsInfo along with Alpha and
# Beta values, which stay empty unless the specified measure uses them.
#
sub ProcessBitVectorComparisonOptions {
  my($ComparisonMeasure, $SupportedComparisonMeasure, @SupportedComparisonMeasures, %SupportedComparisonMeasuresNameMap, %SupportedComparisonMeasuresMethodMap);

  # Setup supported bit vector similarity coefficients for bit vector strings...
  @SupportedComparisonMeasures = ();
  %SupportedComparisonMeasuresNameMap = ();
  %SupportedComparisonMeasuresMethodMap = ();

  for $SupportedComparisonMeasure (Fingerprints::FingerprintsBitVector::GetSupportedSimilarityCoefficients()) {
    # Similarity coefficient function/method names contain "Coefficient" in their names.
    # So take 'em out and setup a map to original function/method name...
    ($ComparisonMeasure = $SupportedComparisonMeasure) =~ s/Coefficient$//;

    push @SupportedComparisonMeasures, $ComparisonMeasure;
    $SupportedComparisonMeasuresNameMap{lc($ComparisonMeasure)} = $ComparisonMeasure;
    $SupportedComparisonMeasuresMethodMap{lc($ComparisonMeasure)} = $SupportedComparisonMeasure;
  }

  # Setup similarity coefficient to use for comparing bit vector strings...
  my($SpecifiedMeasure, $SpecifiedMeasureKey);

  $SpecifiedMeasure = $Options{bitvectorcomparisonmode};
  $SpecifiedMeasureKey = lc($SpecifiedMeasure);

  if (!exists $SupportedComparisonMeasuresMethodMap{$SpecifiedMeasureKey}) {
    die "Error: The value specified, $SpecifiedMeasure, for option \"-b --BitVectorComparisonMode\" is not valid.\nAllowed values:", JoinWords(\@SupportedComparisonMeasures, ", ", 0), "\n";
  }

  $OptionsInfo{BitVectorComparisonMode} = $Options{bitvectorcomparisonmode};

  $OptionsInfo{SpecifiedBitVectorComparisonMeasure} = $SpecifiedMeasure;
  $OptionsInfo{SpecifiedBitVectorComparisonMeasureName} = $SupportedComparisonMeasuresNameMap{$SpecifiedMeasureKey};
  $OptionsInfo{SpecifiedBitVectorComparisonMeasureMethod} = $SupportedComparisonMeasuresMethodMap{$SpecifiedMeasureKey};

  # Make sure valid alpha parameter is specified for Tversky calculation. The measure is
  # selected using "-b --BitVectorComparisonMode" option, so that's the one to name in
  # error messages...
  $OptionsInfo{Alpha} = '';
  if ($SpecifiedMeasure =~ /^(TverskySimilarity|WeightedTverskySimilarity)$/i) {
    if (IsEmpty($Options{alpha})) {
      die "Error: You must specify a value for \"-a, --alpha\" option in \"TverskySimilarity or WeightedTverskySimilarity\" \"-b --BitVectorComparisonMode\". \n";
    }
    my($Alpha);
    $Alpha = $Options{alpha};
    if (!(IsFloat($Alpha) && $Alpha >=0 && $Alpha <= 1)) {
      die "Error: The value specified, $Options{alpha}, for option \"-a, --alpha\" is not valid. Allowed values: >= 0 and <= 1\n";
    }
    $OptionsInfo{Alpha} = $Alpha;
  }

  # Make sure valid beta parameter is specified for WeightedTanimoto and WeightedTversky
  # calculations. Beta option is named "--beta" in error messages as "-b" is the short
  # name of BitVectorComparisonMode option...
  $OptionsInfo{Beta} = '';
  if ($SpecifiedMeasure =~ /^(WeightedTverskySimilarity|WeightedTanimotoSimilarity)$/i) {
    if (IsEmpty($Options{beta})) {
      die "Error: You must specify a value for \"--beta\" option in \"WeightedTverskySimilarity or WeightedTanimotoSimilarity\" \"-b --BitVectorComparisonMode\". \n";
    }
    my($Beta);
    $Beta = $Options{beta};
    if (!(IsFloat($Beta) && $Beta >=0 && $Beta <= 1)) {
      die "Error: The value specified, $Options{beta}, for option \"--beta\" is not valid. Allowed values: >= 0 and <= 1\n";
    }
    $OptionsInfo{Beta} = $Beta;
  }
}

# Process options related to comparison of vector strings...
#
# Validates the values of VectorComparisonMode and VectorComparisonFormulism options,
# with all spaces taken out, and sets up the specified measure, its name, its method
# name and the calculation mode in %OptionsInfo.
#
sub ProcessVectorComparisonOptions {
  my($ComparisonMeasure, $SupportedComparisonMeasure, @SupportedComparisonMeasures, %SupportedComparisonMeasuresNameMap, %SupportedComparisonMeasuresMethodMap);

  # Setup supported distance and similarity coefficients for vector strings...
  @SupportedComparisonMeasures = ();
  %SupportedComparisonMeasuresNameMap = ();
  %SupportedComparisonMeasuresMethodMap = ();

  for $SupportedComparisonMeasure (Fingerprints::FingerprintsVector::GetSupportedDistanceAndSimilarityCoefficients()) {
    # Similarity and distance coefficient function/method names contain "Coefficient" in their names.
    # So take 'em out and setup a map to original function/method name...
    ($ComparisonMeasure = $SupportedComparisonMeasure) =~ s/Coefficient$//i;

    push @SupportedComparisonMeasures, $ComparisonMeasure;
    $SupportedComparisonMeasuresNameMap{lc($ComparisonMeasure)} = $ComparisonMeasure;
    $SupportedComparisonMeasuresMethodMap{lc($ComparisonMeasure)} = $SupportedComparisonMeasure;
  }

  # Setup distance or similarity coefficient to use for comparing vector strings...
  my($SpecifiedMeasure, $SpecifiedMeasureKey);

  $SpecifiedMeasure = $Options{vectorcomparisonmode};
  $SpecifiedMeasure =~ s/ //g;
  $SpecifiedMeasureKey = lc($SpecifiedMeasure);

  if (!exists($SupportedComparisonMeasuresMethodMap{$SpecifiedMeasureKey})) {
    die "Error: The value specified, $SpecifiedMeasure, for option \"-v --VectorComparisonMode\" is not valid.\nAllowed values:", JoinWords(\@SupportedComparisonMeasures, ", ", 0), "\n";
  }

  $OptionsInfo{VectorComparisonMode} = $Options{vectorcomparisonmode};

  $OptionsInfo{SpecifiedVectorComparisonMeasure} = $SpecifiedMeasure;
  $OptionsInfo{SpecifiedVectorComparisonMeasuresName} = $SupportedComparisonMeasuresNameMap{$SpecifiedMeasureKey};
  $OptionsInfo{SpecifiedVectorComparisonMeasuresMethod} = $SupportedComparisonMeasuresMethodMap{$SpecifiedMeasureKey};

  # Setup specified vector comparison calculation modes...
  my($SpecifiedFormulism);

  $SpecifiedFormulism = $Options{vectorcomparisonformulism};
  $SpecifiedFormulism =~ s/ //g;
  if ($SpecifiedFormulism !~ /^(AlgebraicForm|BinaryForm|SetTheoreticForm)$/i) {
    die "Error: The value specified, $SpecifiedFormulism, for option \"--VectorComparisonFormulism\" is not valid. Allowed values: AlgebraicForm, BinaryForm or SetTheoreticForm\n";
  }

  $OptionsInfo{VectorComparisonFormulism} = $Options{vectorcomparisonformulism};
  $OptionsInfo{SpecifiedVectorComparisonMode} = $SpecifiedFormulism;

}

# Process compound ID prefix along with compound ID and fingerprints column options for
# text files...
#
# $FileMode is Reference or Database. Option values are read from %Options using lower
# case names such as referencecompoundidcol and stored in %OptionsInfo using names such
# as ReferenceCompoundIDCol. Columns which are not specified are set to AutoDetect.
#
sub _ProcessFingerprintsFileColOptions {
  my($FileMode) = @_;
  my($ColType, $OptionName, $OptionValue, $CompoundIDCol, $FingerprintsCol);

  $OptionValue = $Options{lc("${FileMode}CompoundIDPrefix")};
  $OptionsInfo{"${FileMode}CompoundIDPrefix"} = $OptionValue ? $OptionValue : 'Cmpd';

  $OptionsInfo{"${FileMode}ColMode"} = $Options{lc("${FileMode}ColMode")};

  # Compound ID column is processed before fingerprints column...
  for $ColType ('CompoundID', 'Fingerprints') {
    $OptionName = "${FileMode}${ColType}Col";
    $OptionValue = $Options{lc($OptionName)};

    if (IsNotEmpty($OptionValue)) {
      if ($Options{lc("${FileMode}ColMode")} =~ /^ColNum$/i && !IsPositiveInteger($OptionValue)) {
        die "Error: Column value, $OptionValue, specified using \"--$OptionName\" is not valid: Allowed integer values: > 0\n";
      }
      $OptionsInfo{$OptionName} = $OptionValue;
    }
    else {
      $OptionsInfo{$OptionName} = 'AutoDetect';
    }
  }

  # Compound ID and fingerprints must come from different columns. Values are compared as
  # numbers when both are positive integers and as strings otherwise...
  $CompoundIDCol = $Options{lc("${FileMode}CompoundIDCol")};
  $FingerprintsCol = $Options{lc("${FileMode}FingerprintsCol")};

  if (IsNotEmpty($CompoundIDCol) && IsNotEmpty($FingerprintsCol)) {
    my($SameCol);
    if (IsPositiveInteger($CompoundIDCol) && IsPositiveInteger($FingerprintsCol)) {
      $SameCol = ($CompoundIDCol == $FingerprintsCol) ? 1 : 0;
    }
    else {
      $SameCol = ($CompoundIDCol eq $FingerprintsCol) ? 1 : 0;
    }
    if ($SameCol) {
      die "Error: Values specified using \"--${FileMode}CompoundIDCol\" and \"--${FileMode}FingerprintsCol\", $CompoundIDCol, must be different.\n";
    }
  }
}

# Process compound ID and fingerprints field options for SD files...
#
# $FileMode is Reference or Database. Option values are read from %Options using lower
# case names such as referencecompoundidfield and stored in %OptionsInfo using names
# such as ReferenceCompoundIDField. Fingerprints field is set to AutoDetect when it's
# not specified.
#
sub _ProcessFingerprintsFileFieldOptions {
  my($FileMode) = @_;
  my($CompoundIDMode, $CompoundIDField, $FingerprintsField);

  $CompoundIDMode = $Options{lc("${FileMode}CompoundIDMode")};
  $CompoundIDField = $Options{lc("${FileMode}CompoundIDField")};
  $FingerprintsField = $Options{lc("${FileMode}FingerprintsField")};

  $OptionsInfo{"${FileMode}CompoundIDMode"} = $CompoundIDMode;

  if ($CompoundIDMode =~ /^DataField$/i && !$CompoundIDField) {
    die "Error: You must specify a value for \"--${FileMode}CompoundIDField\" option in \"DataField\" \"--${FileMode}CompoundIDMode\". \n";
  }
  $OptionsInfo{"${FileMode}CompoundIDField"} = $CompoundIDField ? $CompoundIDField : '';

  $OptionsInfo{"${FileMode}FingerprintsField"} = IsNotEmpty($FingerprintsField) ? $FingerprintsField : 'AutoDetect';

  # Compound ID and fingerprints must come from different data fields...
  if ($CompoundIDField && IsNotEmpty($FingerprintsField) && $CompoundIDField eq $FingerprintsField) {
    die "Error: Values specified using \"--${FileMode}CompoundIDField\" and \"--${FileMode}Fingerprintsfield\", $CompoundIDField, must be different.\n";
  }
}

# Process options related to data retrieval from reference fingerprints SD and CSV/TSV
# text files...
#
sub ProcessReferenceFingerprintsDataOptions {

  # Compound ID and fingerprints column options for text files...
  _ProcessFingerprintsFileColOptions('Reference');

  # Compound ID and fingerprints field options for SD files...
  _ProcessFingerprintsFileFieldOptions('Reference');

}

# Process options related to data retrieval from database fingerprints SD and CSV/TSV
# text files...
#
# In addition to the compound ID and fingerprints options, the database data columns
# and data fields to write out are set up in %OptionsInfo.
#
sub ProcessDatabaseFingerprintsDataOptions {

  # Compound ID and fingerprints column options for text files...
  _ProcessFingerprintsFileColOptions('Database');

  # Database data column options for text files...

  $OptionsInfo{DatabaseDataColsMode} = $Options{databasedatacolsmode};
  $OptionsInfo{DatabaseDataCols} = '';
  @{$OptionsInfo{SpecifiedDatabaseDataCols}} = ();

  if ($Options{databasedatacolsmode} =~ /^Specify$/i) {
    my($DatabaseDataCols, $DatabaseColNum, @SpecifiedDataCols);

    if (!$Options{databasedatacols}) {
      die "Error: You must specify a value for \"--DatabaseDataCols\" option in \"Specify\" \"--DatabaseDataColsMode\". \n";
    }
    $DatabaseDataCols = $Options{databasedatacols};

    if ($Options{databasecolmode} =~ /^ColNum$/i) {
      # Spaces are taken out only for column numbers; column labels may contain them...
      $DatabaseDataCols =~ s/ //g;
      @SpecifiedDataCols = split /\,/, $DatabaseDataCols;
      for $DatabaseColNum (@SpecifiedDataCols) {
        if (!IsPositiveInteger($DatabaseColNum)) {
          die "Error: Column value, $DatabaseColNum, specified using \"--DatabaseDataCols\" is not valid: Allowed integer values: > 0\n";
        }
      }
    }
    else {
      @SpecifiedDataCols = split /\,/, $DatabaseDataCols;
    }
    $OptionsInfo{DatabaseDataCols} = $DatabaseDataCols;
    push @{$OptionsInfo{SpecifiedDatabaseDataCols}}, @SpecifiedDataCols;
  }
  elsif ($Options{databasedatacolsmode} =~ /^All$/i) {
    $OptionsInfo{DatabaseDataCols} = 'All';
  }

  # Catch a value which contains nothing but spaces in ColNum mode...
  if ($OptionsInfo{DatabaseDataColsMode} =~ /^Specify$/i && !$OptionsInfo{DatabaseDataCols}) {
    die "Error: You must specify a value for \"--DatabaseDataCols\" option in \"Specify\" \"--DatabaseDataColsMode\". \n";
  }

  # Compound ID and fingerprints field options for SD files...
  _ProcessFingerprintsFileFieldOptions('Database');

  # Database data field options for SD files...

  $OptionsInfo{DatabaseDataFieldsMode} = $Options{databasedatafieldsmode};
  $OptionsInfo{DatabaseDataFields} = '';
  @{$OptionsInfo{SpecifiedDatabaseDataFields}} = ();

  if ($Options{databasedatafieldsmode} =~ /^Specify$/i && !$Options{databasedatafields}) {
    die "Error: You must specify a value for \"--DatabaseDataFields\" option in \"Specify\" \"--DatabaseDataFieldsMode\". \n";
  }
  if ($Options{databasedatafields}) {
    my(@SpecifiedDataFields);
    $OptionsInfo{DatabaseDataFields} = $Options{databasedatafields};

    @SpecifiedDataFields = split /\,/, $Options{databasedatafields};
    push @{$OptionsInfo{SpecifiedDatabaseDataFields}}, @SpecifiedDataFields;
  }
}

# Setup script usage and retrieve command line arguments specified using various options...
sub SetupScriptUsage {

  # Retrieve all the options...
1456 %Options = (); 1457 1458 $Options{alpha} = 0.5; 1459 $Options{beta} = 1; 1460 1461 $Options{bitvectorcomparisonmode} = "TanimotoSimilarity"; 1462 1463 $Options{databasecolmode} = 'colnum'; 1464 1465 $Options{databasecompoundidprefix} = 'Cmpd'; 1466 $Options{databasecompoundidmode} = 'LabelPrefix'; 1467 1468 $Options{databasedatacolsmode} = 'CompoundID'; 1469 $Options{databasedatafieldsmode} = 'CompoundID'; 1470 1471 $Options{distancecutoff} = 10; 1472 1473 $Options{referencecolmode} = 'colnum'; 1474 1475 $Options{referencecompoundidprefix} = 'Cmpd'; 1476 $Options{referencecompoundidmode} = 'LabelPrefix'; 1477 1478 $Options{detail} = 1; 1479 1480 $Options{fingerprintsmode} = 'AutoDetect'; 1481 $Options{groupfusionrule} = 'Max'; 1482 $Options{groupfusionapplycutoff} = 'Yes'; 1483 1484 $Options{knn} = 'All'; 1485 1486 $Options{mode} = 'MultipleReferences'; 1487 1488 $Options{numofsimilarmolecules} = 10; 1489 $Options{percentsimilarmolecules} = 1; 1490 1491 $Options{indelim} = 'comma'; 1492 $Options{outdelim} = 'comma'; 1493 $Options{quote} = 'yes'; 1494 1495 $Options{output} = 'text'; 1496 1497 $Options{precision} = 2; 1498 1499 $Options{searchmode} = 'SimilaritySearch'; 1500 1501 $Options{similarcountmode} = 'NumOfSimilar'; 1502 1503 $Options{similaritycutoff} = 0.75; 1504 1505 $Options{vectorcomparisonmode} = 'TanimotoSimilarity'; 1506 $Options{vectorcomparisonformulism} = 'AlgebraicForm'; 1507 1508 if (!GetOptions(\%Options, "alpha=f", "beta=f", "bitvectorcomparisonmode|b=s", "databasecolmode=s", "databasecompoundidcol=s", "databasecompoundidprefix=s", "databasecompoundidfield=s", "databasecompoundidmode=s", "databasedatacols=s", "databasedatacolsmode=s", "databasedatafields=s", "databasedatafieldsmode=s", "databasefingerprintscol=s", "databasefingerprintsfield=s", "distancecutoff=f", "detail|d=i", "fast|f", "fingerprintsmode=s", "groupfusionrule|g=s", , "groupfusionapplycutoff=s", "help|h", "indelim=s", "knn|k=s", "mode|m=s", "numofsimilarmolecules|n=i", 
"outdelim=s", "output=s", "overwrite|o", "percentsimilarmolecules|p=f", "precision=s", "quote|q=s", "referencecolmode=s", "referencecompoundidcol=s", "referencecompoundidprefix=s", "referencecompoundidfield=s", "referencecompoundidmode=s", "referencefingerprintscol=s", "referencefingerprintsfield=s", "root|r=s", "searchmode|s=s", "similarcountmode=s", "similaritycutoff=f", "vectorcomparisonmode|v=s", "vectorcomparisonformulism=s", "workingdir|w=s")) { 1509 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; 1510 } 1511 if ($Options{workingdir}) { 1512 if (! -d $Options{workingdir}) { 1513 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; 1514 } 1515 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; 1516 } 1517 if ($Options{databasecolmode} !~ /^(ColNum|ColLabel)$/i) { 1518 die "Error: The value specified, $Options{databasecolmode}, for option \"--DatabaseColMode\" is not valid. Allowed values: ColNum, or ColLabel\n"; 1519 } 1520 if ($Options{databasecompoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) { 1521 die "Error: The value specified, $Options{databasecompoundidmode}, for option \"--DatabaseCompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n"; 1522 } 1523 if ($Options{databasedatacolsmode} !~ /^(All|Specify|CompoundID)$/i) { 1524 die "Error: The value specified, $Options{databasedatacolsmode}, for option \"--DatabaseDataColsMode\" is not valid. Allowed values: All, Specify, or CompoundID\n"; 1525 } 1526 if ($Options{databasedatafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) { 1527 die "Error: The value specified, $Options{databasedatafieldsmode}, for option \"--DatabaseDataFieldsMode\" is not valid. 
Allowed values: All, Common, Specify, or CompoundID\n"; 1528 } 1529 if (!IsPositiveInteger($Options{detail})) { 1530 die "Error: The value specified, $Options{detail}, for option \"-d, --detail\" is not valid. Allowed values: > 0 \n"; 1531 } 1532 if ($Options{fingerprintsmode} !~ /^(AutoDetect|FingerprintsBitVectorString|FingerprintsVectorString)$/i) { 1533 die "Error: The value specified, $Options{fingerprintsmode}, for option \"--FingerprintsMode\" is not valid. Allowed values: AutoDetect, FingerprintsBitVectorString or FingerprintsVectorString \n"; 1534 } 1535 if ($Options{groupfusionrule} !~ /^(Max|Min|Mean|Median|Sum|Euclidean)$/i) { 1536 die "Error: The value specified, $Options{groupfusionrule}, for option \"-g, --GroupFusionRule\" is not valid. Allowed values: Max, Min, Mean, Median, Sum, Euclidean\n"; 1537 } 1538 if ($Options{groupfusionapplycutoff} !~ /^(Yes|No)$/i) { 1539 die "Error: The value specified, $Options{quote}, for option \"--GroupFusionApplyCutoff\" is not valid. Allowed values: Yes or No\n"; 1540 } 1541 if ($Options{indelim} !~ /^(comma|semicolon)$/i) { 1542 die "Error: The value specified, $Options{indelim}, for option \"--InDelim\" is not valid. Allowed values: comma, or semicolon\n"; 1543 } 1544 if ($Options{mode} !~ /^(IndividualReference|MultipleReferences)$/i) { 1545 die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: IndividualReference, MultipleReferences\n"; 1546 } 1547 if (!IsPositiveInteger($Options{numofsimilarmolecules})) { 1548 die "Error: The value specified, $Options{numofsimilarmolecules}, for option \"-n, --NumOfSimilarMolecules\" is not valid. Allowed values: > 0 \n"; 1549 } 1550 if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) { 1551 die "Error: The value specified, $Options{outdelim}, for option \"--OutDelim\" is not valid. 
Allowed values: comma, tab, or semicolon\n"; 1552 } 1553 if ($Options{output} !~ /^(SD|text|both)$/i) { 1554 die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n"; 1555 } 1556 if (!(IsFloat($Options{percentsimilarmolecules}) && $Options{percentsimilarmolecules} > 0 && $Options{percentsimilarmolecules} <= 100)) { 1557 die "Error: The value specified, $Options{percentsimilarmolecules}, for option \"-p, --PercentSimilarMolecules\" is not valid. Allowed values: > 0 and <= 100 \n"; 1558 } 1559 if ($Options{quote} !~ /^(Yes|No)$/i) { 1560 die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n"; 1561 } 1562 if (!IsPositiveInteger($Options{precision})) { 1563 die "Error: The value specified, $Options{precision}, for option \"--precision\" is not valid. Allowed values: > 0 \n"; 1564 } 1565 if ($Options{referencecolmode} !~ /^(ColNum|ColLabel)$/i) { 1566 die "Error: The value specified, $Options{referencecolmode}, for option \"--ReferenceColMode\" is not valid. Allowed values: ColNum, or ColLabel\n"; 1567 } 1568 if ($Options{referencecompoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) { 1569 die "Error: The value specified, $Options{referencecompoundidmode}, for option \"--ReferenceCompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n"; 1570 } 1571 if ($Options{searchmode} !~ /^(SimilaritySearch|DissimilaritySearch)$/i) { 1572 die "Error: The value specified, $Options{searchmode}, for option \"-s, --SearchMode\" is not valid. Allowed values: SimilaritySearch, DissimilaritySearch \n"; 1573 } 1574 if ($Options{similarcountmode} !~ /^(NumOfSimilar|PercentSimilar)$/i) { 1575 die "Error: The value specified, $Options{similarcountmode}, for option \"--SimilarCountMode\" is not valid. Allowed values: NumOfSimilar, PercentSimilar \n"; 1576 } 1577 } 1578