1 package Fingerprints::FingerprintsVector; 2 # 3 # File: FingerprintsVector.pm 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # This file is part of MayaChemTools. 9 # 10 # MayaChemTools is free software; you can redistribute it and/or modify it under 11 # the terms of the GNU Lesser General Public License as published by the Free 12 # Software Foundation; either version 3 of the License, or (at your option) any 13 # later version. 14 # 15 # MayaChemTools is distributed in the hope that it will be useful, but without 16 # any warranty; without even the implied warranty of merchantability of fitness 17 # for a particular purpose. See the GNU Lesser General Public License for more 18 # details. 19 # 20 # You should have received a copy of the GNU Lesser General Public License 21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 23 # Boston, MA, 02111-1307, USA. 24 # 25 26 use strict; 27 use Carp; 28 use Exporter; 29 use Scalar::Util (); 30 use MathUtil (); 31 use TextUtil (); 32 use StatisticsUtil (); 33 use BitVector; 34 use Vector; 35 36 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); 37 38 @ISA = qw(Exporter); 39 40 # Distance coefficients 41 my(@DistanceCoefficients) = qw(CityBlockDistanceCoefficient EuclideanDistanceCoefficient HammingDistanceCoefficient ManhattanDistanceCoefficient SoergelDistanceCoefficient); 42 43 # Similarity coefficients... 44 my(@SimilarityCoefficients) = qw(CosineSimilarityCoefficient CzekanowskiSimilarityCoefficient DiceSimilarityCoefficient OchiaiSimilarityCoefficient JaccardSimilarityCoefficient SorensonSimilarityCoefficient TanimotoSimilarityCoefficient); 45 46 # New from string... 
my(@NewFromString) = qw(NewFromValuesString NewFromValuesAndIDsString NewFromIDsAndValuesString NewFromValuesAndIDsPairsString NewFromIDsAndValuesPairsString);

@EXPORT = qw(IsFingerprintsVector);
@EXPORT_OK = qw(GetSupportedDistanceCoefficients GetSupportedSimilarityCoefficients GetSupportedDistanceAndSimilarityCoefficients @DistanceCoefficients @SimilarityCoefficients);

%EXPORT_TAGS = (
  new => [@NewFromString],
  distancecoefficients => [@DistanceCoefficients],
  similaritycoefficients => [@SimilarityCoefficients],
  all => [@EXPORT, @EXPORT_OK]
);

# Setup class variables...
my($ClassName);
_InitializeClass();

# Overload Perl functions...
use overload '""' => 'StringifyFingerprintsVector';

# Class constructor...
#
# Takes a list of property names and values; each one is applied through the
# corresponding Set<Name> method. Returns the newly created object.
#
sub new {
  my($Class, %NamesAndValues) = @_;

  # An object invocant is reduced to its class name before blessing...
  my $This = bless {}, ref($Class) || $Class;

  $This->_InitializeFingerprintsVector();
  $This->_InitializeFingerprintsVectorProperties(%NamesAndValues);

  return $This;
}

# Initialize object data...
#
# Resets type to an empty string and empties the values and value IDs lists in place.
#
sub _InitializeFingerprintsVector {
  my($This) = @_;

  # Type of fingerprint vector...
  $This->{Type} = '';

  # Fingerprint vector values and fingerprint vector value IDs...
  for my $ListName (qw(Values ValueIDs)) {
    @{$This->{$ListName}} = ();
  }

  return $This;
}

# Initialize class ...
sub _InitializeClass {
  # Class name used as a prefix in error and warning messages...
  $ClassName = __PACKAGE__;
}

# Initialize object properties....
sub _InitializeFingerprintsVectorProperties {
  my($This, %NamesAndValues) = @_;

  # Each specified name is mapped to its Set<Name> method, which is invoked with
  # the corresponding value...
  for my $Name (keys %NamesAndValues) {
    my $SetterName = "Set${Name}";
    $This->$SetterName($NamesAndValues{$Name});
  }

  # Type is mandatory; its presence is checked only after all the specified
  # properties have been set...
  unless (exists $NamesAndValues{Type}) {
    croak "Error: ${ClassName}->New: Object can't be instantiated without specifying type...";
  }
  return $This;
}

# Create a new fingerprints vector using space delimited values string. This functionality can be
# either invoked as a class function or an object method.
#
# Parameters: [invocant,] Type, ValuesString. An undefined, empty or "None" values string
# corresponds to an empty list of values.
#
sub NewFromValuesString ($$;$) {
  my(@Parameters) = @_;

  # Exactly three parameters imply a method invocation; the invocant is not used...
  shift @Parameters if (@Parameters == 3);
  my($Type, $ValuesString) = @Parameters;

  my $ValuesSpecified = (defined($ValuesString) && length($ValuesString) && $ValuesString !~ /^None$/i);
  my @Values = $ValuesSpecified ? split(' ', $ValuesString) : ();

  return Fingerprints::FingerprintsVector->new('Type' => $Type, 'Values' => \@Values);
}

# Create a new fingerprints vector using values and IDs string containing semicolon
# delimited value string and value IDs strings. The values within value and value IDs
# string are delimited by spaces.
#
# This functionality can be either invoked as a class function or an object method.
#
# Parameters: [invocant,] Type, ValuesAndIDsString. Returns undef after a warning when
# number of values and value IDs is different.
#
sub NewFromValuesAndIDsString ($$;$) {
  my(@Parameters) = @_;

  # Exactly three parameters imply a method invocation; the invocant is not used...
  shift @Parameters if (@Parameters == 3);
  my($Type, $ValuesAndIDsString) = @Parameters;

  # Values come first followed by value IDs...
  my($ValuesString, $ValueIDsString) = split(';', $ValuesAndIDsString);

  # Undefined, empty or "None" strings correspond to empty lists...
  my $ValuesSpecified = (defined($ValuesString) && length($ValuesString) && $ValuesString !~ /^None$/i);
  my @Values = $ValuesSpecified ? split(' ', $ValuesString) : ();

  my $ValueIDsSpecified = (defined($ValueIDsString) && length($ValueIDsString) && $ValueIDsString !~ /^None$/i);
  my @ValueIDs = $ValueIDsSpecified ? split(' ', $ValueIDsString) : ();

  unless (@Values == @ValueIDs) {
    carp "Warning: ${ClassName}->NewFromValuesAndIDsString: Object can't be instantiated: Number specified values, " . scalar @Values . ", must be equal to number of specified value IDs, " . scalar @ValueIDs . "...";
    return undef;
  }

  return Fingerprints::FingerprintsVector->new('Type' => $Type, 'Values' => \@Values, 'ValueIDs' => \@ValueIDs);
}

# Create a new fingerprints vector using IDs and values string containing semicolon
# delimited value IDs string and values strings. The values within value and value IDs
# string are delimited by spaces.
#
# This functionality can be either invoked as a class function or an object method.
#
# Parameters: [invocant,] Type, IDsAndValuesString. Returns undef after a warning when
# number of values and value IDs is different.
#
sub NewFromIDsAndValuesString ($$;$) {
  my(@Parameters) = @_;

  # Exactly three parameters imply a method invocation; the invocant is not used...
  shift @Parameters if (@Parameters == 3);
  my($Type, $IDsAndValuesString) = @Parameters;

  # Value IDs come first followed by values...
  my($ValueIDsString, $ValuesString) = split(';', $IDsAndValuesString);

  # Undefined, empty or "None" strings correspond to empty lists...
  my $ValuesSpecified = (defined($ValuesString) && length($ValuesString) && $ValuesString !~ /^None$/i);
  my @Values = $ValuesSpecified ? split(' ', $ValuesString) : ();

  my $ValueIDsSpecified = (defined($ValueIDsString) && length($ValueIDsString) && $ValueIDsString !~ /^None$/i);
  my @ValueIDs = $ValueIDsSpecified ? split(' ', $ValueIDsString) : ();

  unless (@Values == @ValueIDs) {
    carp "Warning: ${ClassName}->NewFromIDsAndValuesString: Object can't be instantiated: Number specified values, " . scalar @Values . ", must be equal to number of specified value IDs, " . scalar @ValueIDs . "...";
    return undef;
  }

  return Fingerprints::FingerprintsVector->new('Type' => $Type, 'Values' => \@Values, 'ValueIDs' => \@ValueIDs);
}

# Create a new fingerprints vector using values and IDs pairs string containing space
# value and value IDs pairs.
#
# This functionality can be either invoked as a class function or an object method.
#
# Parameters: [invocant,] Type, ValuesAndIDsPairsString. Returns undef after a warning when
# the pairs string contains an odd number of entries. A single "None None" pair corresponds
# to an empty vector.
#
sub NewFromValuesAndIDsPairsString ($$;$) {
  my(@Parameters) = @_;

  # Exactly three parameters imply a method invocation; the invocant is not used...
  shift @Parameters if (@Parameters == 3);
  my($Type, $ValuesAndIDsPairsString) = @Parameters;

  my @ValuesAndIDsPairs = split(' ', $ValuesAndIDsPairsString);
  if (@ValuesAndIDsPairs % 2) {
    carp "Warning: ${ClassName}->NewFromValuesAndIDsPairsString: No fingerprint vector created: Invalid values and IDs pairs data: Input list must contain even number of values and IDs pairs...";
    return undef;
  }

  my(@Values, @ValueIDs);
  my $NonePairSpecified = (@ValuesAndIDsPairs == 2 && $ValuesAndIDsPairs[0] =~ /^None$/i && $ValuesAndIDsPairs[1] =~ /^None$/i);
  unless ($NonePairSpecified) {
    # Consume the list two entries at a time: value followed by its ID...
    while (@ValuesAndIDsPairs) {
      my($Value, $ValueID) = splice(@ValuesAndIDsPairs, 0, 2);
      push @Values, $Value;
      push @ValueIDs, $ValueID;
    }
  }

  return Fingerprints::FingerprintsVector->new('Type' => $Type, 'Values' => \@Values, 'ValueIDs' => \@ValueIDs);
}

# Create a new fingerprints vector using IDs and values pairs string containing space
# value IDs and valus pairs.
#
# This functionality can be either invoked as a class function or an object method.
#
# Parameters: [invocant,] Type, IDsAndValuesPairsString. A single "None None" pair corresponds
# to an empty vector.
#
# Returns the new fingerprints vector, or undef after a warning when the pairs string contains
# an odd number of entries. This matches the other NewFrom... functions, which warn and return
# undef on malformed input instead of terminating the program.
#
sub NewFromIDsAndValuesPairsString ($$;$) {
  my(@Parameters) = @_;

  # Exactly three parameters imply a method invocation; the invocant is not used...
  shift @Parameters if (@Parameters == 3);
  my($Type, $IDsAndValuesPairsString) = @Parameters;

  my @IDsAndValuesPairs = split(' ', $IDsAndValuesPairsString);
  if (@IDsAndValuesPairs % 2) {
    # Warn instead of die: the return statement below used to be unreachable...
    carp "Warning: ${ClassName}->NewFromIDsAndValuesPairsString: No fingerprint vector created: Invalid values and IDs pairs data: Input list must contain even number of values and IDs pairs...";
    return undef;
  }

  my(@Values, @ValueIDs);
  my $NonePairSpecified = (@IDsAndValuesPairs == 2 && $IDsAndValuesPairs[0] =~ /^None$/i && $IDsAndValuesPairs[1] =~ /^None$/i);
  unless ($NonePairSpecified) {
    # Consume the list two entries at a time: ID followed by its value...
    while (@IDsAndValuesPairs) {
      my($ValueID, $Value) = splice(@IDsAndValuesPairs, 0, 2);
      push @ValueIDs, $ValueID;
      push @Values, $Value;
    }
  }

  return Fingerprints::FingerprintsVector->new('Type' => $Type, 'Values' => \@Values, 'ValueIDs' => \@ValueIDs);
}

# Set type of fingerprint vector. Supported types are: OrderedNumericalValues, NumericalValues, and
# AlphaNumericalValues
#
# . For OrderedNumericalValues type, both vectors must be of the same size and contain similar
#   types of numerical values in the same order.
#
# . For NumericalValues type, vector value IDs for both vectors must be specified; however, their
#   size and order of IDs and numerical values may be different. For each vector, value IDs must
#   correspond to vector values.
#
# . For AlphaNumericalValues type, vectors may contain both numerical and alphanumerical values
#   and their sizes may be different.
#
# The type can be set only once: any attempt to change an already set type is an error.
#
sub SetType {
  my($This, $Type) = @_;

  unless ($Type =~ /^(OrderedNumericalValues|NumericalValues|AlphaNumericalValues)$/i) {
    croak "Error: ${ClassName}->SetType: Specified value, $Type, for Type is not vaild. Supported types in current release of MayaChemTools: OrderedNumericalValues, NumericalValues or AlphaNumericalValues";
  }

  croak "Error: ${ClassName}->SetType: Can't change intial fingerprints vector type: It's already set..." if ($This->{Type});

  $This->{Type} = $Type;

  return $This;
}

# Get fingerprints vector type...
#
sub GetType {
  my($This) = @_;

  return $This->{Type};
}

# Set ID...
sub SetID {
  my($This, $ID) = @_;

  $This->{ID} = $ID;

  return $This;
}

# Get ID or 'None' when no ID has been set...
sub GetID {
  my($This) = @_;

  if (exists $This->{ID}) {
    return $This->{ID};
  }
  return 'None';
}

# Set description...
sub SetDescription {
  my($This, $Description) = @_;

  $This->{Description} = $Description;

  return $This;
}

# Get description or a default string when no description has been set...
sub GetDescription {
  my($This) = @_;

  if (exists $This->{Description}) {
    return $This->{Description};
  }
  return 'No description available';
}

# Set vector type...
sub SetVectorType {
  my($This, $VectorType) = @_;

  $This->{VectorType} = $VectorType;

  return $This;
}

# Get vector type or 'FingerprintsVector' when no vector type has been set...
sub GetVectorType {
  my($This) = @_;

  if (exists $This->{VectorType}) {
    return $This->{VectorType};
  }
  return 'FingerprintsVector';
}

# Set values of a fingerprint vector using a vector, reference to an array or an array...
#
sub SetValues {
  my($This, @NewValues) = @_;

  $This->_SetOrAddValuesOrValueIDs("SetValues", @NewValues);

  return $This;
}

# Set value IDs of a fingerprint vector using a vector, reference to an array or an array...
#
sub SetValueIDs {
  my($This, @NewValueIDs) = @_;

  $This->_SetOrAddValuesOrValueIDs("SetValueIDs", @NewValueIDs);

  return $This;
}

# Add values to a fingerprint vector using a vector, reference to an array or an array...
#
sub AddValues {
  my($This, @NewValues) = @_;

  $This->_SetOrAddValuesOrValueIDs("AddValues", @NewValues);

  return $This;
}

# Add value IDs to a fingerprint vector using a vector, reference to an array or an array...
#
sub AddValueIDs {
  my($This, @NewValueIDs) = @_;

  $This->_SetOrAddValuesOrValueIDs("AddValueIDs", @NewValueIDs);

  return $This;
}

# Set or add values or value IDs using:
#
#    o List of values or ValueIDs
#    o Reference to an list of values or ValuesIDs
#    o A vector containing values or ValueIDs
#
# Mode is one of SetValues, SetValueIDs, AddValues or AddValueIDs. Nothing is done
# for an empty list.
#
sub _SetOrAddValuesOrValueIDs {
  my($This, $Mode, @Values) = @_;

  return unless (@Values);

  # Only the first specified value is examined to figure out the input format...
  my $FirstValue = $Values[0];
  my $TypeOfFirstValue = ref $FirstValue;
  if ($TypeOfFirstValue =~ /^(SCALAR|HASH|CODE|REF|GLOB)/) {
    croak "Error: ${ClassName}-> _SetOrAddValuesOrValueIDs: Trying to add values to vector object with a reference to unsupported value format...";
  }

  # Collect specified values or valueIDs...
  my $ValuesRef;
  if (Vector::IsVector($FirstValue)) {
    # It's a vector...
    $ValuesRef = $FirstValue->GetValues();
  }
  elsif ($TypeOfFirstValue =~ /^ARRAY/) {
    # It's an array reference...
    $ValuesRef = $FirstValue;
  }
  else {
    # It's a list of values...
    $ValuesRef = \@Values;
  }

  # Figure out the list to update and whether it needs to be emptied first...
  my($ListName, $EmptyList);
  if ($Mode =~ /^SetValues$/i) {
    ($ListName, $EmptyList) = ('Values', 1);
  }
  elsif ($Mode =~ /^SetValueIDs$/i) {
    ($ListName, $EmptyList) = ('ValueIDs', 1);
  }
  elsif ($Mode =~ /^AddValues$/i) {
    ($ListName, $EmptyList) = ('Values', 0);
  }
  elsif ($Mode =~ /^AddValueIDs$/i) {
    ($ListName, $EmptyList) = ('ValueIDs', 0);
  }
  else {
    croak "Error: ${ClassName}-> _SetOrAddValuesOrValueIDs: Unknown mode $Mode...";
  }

  # Set or add values or value IDs...
  @{$This->{$ListName}} = () if ($EmptyList);
  push @{$This->{$ListName}}, @{$ValuesRef};

  return $This;
}

# Set a specific value in fingerprint vector with indices starting from 0..
#
# Index validation is bypassed when SkipCheck is true.
#
sub SetValue {
  my($This, $Index, $Value, $SkipCheck) = @_;

  unless ($SkipCheck) {
    # Check before setting...
    croak "Error: ${ClassName}->SetValue: Index value must be a positive number..." if ($Index < 0);
    croak "Error: ${ClassName}->SetValue: Index vaue must be less than number of values..." if ($Index >= $This->GetNumOfValues());
  }

  return $This->_SetValue($Index, $Value);
}

# Set a fingerprint vector value without any index validation...
#
sub _SetValue {
  my($This, $Position, $NewValue) = @_;

  $This->{Values}[$Position] = $NewValue;

  return $This;
}

# Get a specific value from fingerprint vector with indices starting from 0...
#
sub GetValue {
  my($This, $Index) = @_;

  croak "Error: ${ClassName}->GetValue: Index value must be a positive number..." if ($Index < 0);
  croak "Error: ${ClassName}->GetValue: Index value must be less than number of values..." if ($Index >= $This->GetNumOfValues());

  return $This->_GetValue($Index);
}

# Get a fingerprint vector value without any index validation...
sub _GetValue {
  my($This, $Position) = @_;

  return $This->{Values}[$Position];
}

# Return vector values as an array or reference to an array...
#
sub GetValues {
  my($This) = @_;

  if (wantarray) {
    return @{$This->{Values}};
  }
  return \@{$This->{Values}};
}

# Set a specific value ID in fingerprint vector with indices starting from 0..
#
# Index validation is bypassed when SkipCheck is true.
#
sub SetValueID {
  my($This, $Index, $Value, $SkipCheck) = @_;

  unless ($SkipCheck) {
    # Check before setting...
    croak "Error: ${ClassName}->SetValueID: Index value must be a positive number..." if ($Index < 0);
    croak "Error: ${ClassName}->SetValueID: Index vaue must be less than number of value IDs..." if ($Index >= $This->GetNumOfValueIDs());
  }

  return $This->_SetValueID($Index, $Value);
}

# Set a fingerprint vector value ID without any index validation...
#
sub _SetValueID {
  my($This, $Position, $NewValueID) = @_;

  $This->{ValueIDs}[$Position] = $NewValueID;

  return $This;
}

# Get a specific value ID from fingerprint vector with indices starting from 0...
#
sub GetValueID {
  my($This, $Index) = @_;

  croak "Error: ${ClassName}->GetValueID: Index value must be a positive number..." if ($Index < 0);
  croak "Error: ${ClassName}->GetValueID: Index value must be less than number of value IDs..." if ($Index >= $This->GetNumOfValueIDs());

  return $This->_GetValueID($Index);
}

# Get a fingerprint vector value ID without any index validation...
#
sub _GetValueID {
  my($This, $Position) = @_;

  return $This->{ValueIDs}[$Position];
}

# Return vector value IDs as an array or reference to an array...
#
sub GetValueIDs {
  my($This) = @_;

  if (wantarray) {
    return @{$This->{ValueIDs}};
  }
  return \@{$This->{ValueIDs}};
}

# Get fingerprints vector string containing values and/or IDs string in a specified format...
#
# Supported formats, with or without String suffix: IDsAndValues, IDsAndValuesPairs,
# ValuesAndIDs, ValuesAndIDsPairs, ValueIDs and Values. Any other format is an error.
#
sub GetFingerprintsVectorString {
  my($This, $Format) = @_;

  return $This->GetIDsAndValuesString() if ($Format =~ /^(IDsAndValuesString|IDsAndValues)$/i);
  return $This->GetIDsAndValuesPairsString() if ($Format =~ /^(IDsAndValuesPairsString|IDsAndValuesPairs)$/i);
  return $This->GetValuesAndIDsString() if ($Format =~ /^(ValuesAndIDsString|ValuesAndIDs)$/i);
  return $This->GetValuesAndIDsPairsString() if ($Format =~ /^(ValuesAndIDsPairsString|ValuesAndIDsPairs)$/i);
  return $This->GetValueIDsString() if ($Format =~ /^(ValueIDsString|ValueIDs)$/i);
  return $This->GetValuesString() if ($Format =~ /^(ValuesString|Values)$/i);

  croak "Error: ${ClassName}->GetFingerprintsVectorString: Specified vector string format, $Format, is not supported. Value values: IDsAndValuesString, IDsAndValues, IDsAndValuesPairsString, IDsAndValuesPairs, ValuesAndIDsString, ValuesAndIDs, ValuesAndIDsPairsString, ValuesAndIDsPairs, ValueIDsString, ValueIDs, ValuesString, Values...";
}

# Get vector value IDs and values string as space delimited ASCII string separated
# by semicolon...
#
sub GetIDsAndValuesString {
  my($This) = @_;

  my $ValueIDsPresent = @{$This->{ValueIDs}} ? 1 : 0;
  my $ValuesPresent = @{$This->{Values}} ? 1 : 0;

  # Values are not available...
  return "None;None" unless ($ValuesPresent);

  # Values are available with or without IDs...
  my $ValueIDsString = $ValueIDsPresent ? join(' ', @{$This->{ValueIDs}}) : "None";

  return $ValueIDsString . ";" . join(' ', @{$This->{Values}});
}

# Get vector value IDs and value pairs string as space delimited ASCII string...
#
sub GetIDsAndValuesPairsString {
  my($This) = @_;

  # Values are unavailable...
  return "None None" unless (@{$This->{Values}});

  my $ValueIDsPresent = @{$This->{ValueIDs}} ? 1 : 0;

  # Each value is preceded by its ID or None when IDs are not available...
  my @IDsAndValuesPairs = ();
  for my $Index (0 .. $#{$This->{Values}}) {
    my $ValueID = $ValueIDsPresent ? $This->{ValueIDs}->[$Index] : 'None';
    push @IDsAndValuesPairs, $ValueID, $This->{Values}->[$Index];
  }
  return join(' ', @IDsAndValuesPairs);
}

# Get vector value and value IDs string as space delimited ASCII string separated
# by semicolon...
#
sub GetValuesAndIDsString {
  my($This) = @_;

  my $ValueIDsPresent = @{$This->{ValueIDs}} ? 1 : 0;
  my $ValuesPresent = @{$This->{Values}} ? 1 : 0;

  # Values are not available...
  return "None;None" unless ($ValuesPresent);

  # Values are available with or without IDs...
  my $ValueIDsString = $ValueIDsPresent ? join(' ', @{$This->{ValueIDs}}) : "None";

  return join(' ', @{$This->{Values}}) . ";" . $ValueIDsString;
}

# Get vector value and value ID pairs string as space delimited ASCII string...
#
sub GetValuesAndIDsPairsString {
  my($This) = @_;

  # Values are unavailable...
  return "None None" unless (@{$This->{Values}});

  my $ValueIDsPresent = @{$This->{ValueIDs}} ? 1 : 0;

  # Each value is followed by its ID or None when IDs are not available...
  my @ValuesAndIDsPairs = ();
  for my $Index (0 .. $#{$This->{Values}}) {
    my $ValueID = $ValueIDsPresent ? $This->{ValueIDs}->[$Index] : 'None';
    push @ValuesAndIDsPairs, $This->{Values}->[$Index], $ValueID;
  }
  return join(' ', @ValuesAndIDsPairs);
}

# Get vector value IDs string as space delimited ASCII string...
#
sub GetValueIDsString {
  my($This) = @_;

  if (@{$This->{ValueIDs}}) {
    return join(' ', @{$This->{ValueIDs}});
  }
  return 'None';
}

# Get vector value string as space delimited ASCII string...
#
sub GetValuesString {
  my($This) = @_;

  if (@{$This->{Values}}) {
    return join(' ', @{$This->{Values}});
  }
  return 'None';
}

# Get number of values...
sub GetNumOfValues {
  my($This) = @_;

  my $NumOfValues = @{$This->{Values}};

  return $NumOfValues;
}

# Get number of non-zero values...
#
# Values are compared numerically against zero.
#
sub GetNumOfNonZeroValues {
  my($This) = @_;

  my $Size = $This->GetNumOfValues();
  my $Count = grep { $This->{Values}[$_] != 0 } (0 .. ($Size - 1));

  return $Count;
}

# Get number of value IDs...
sub GetNumOfValueIDs {
  my($This) = @_;

  my $NumOfValueIDs = @{$This->{ValueIDs}};

  return $NumOfValueIDs;
}

# FingerprintsVector class provides methods to calculate similarity between vectors
# containing three different types of values:
#
# Type I: OrderedNumericalValues
#
#   . Size of two vectors are same
#   . Vectors contain real values in a specific order. For example: MACCS keys count, Topological
#     pharmacophore atom pairs and so on.
#   . Option to calculate similarity value using continuous values or binary values
#
# Type II: UnorderedNumericalValues
#
#   . Size of two vectors might not be same
#   . Vectors contain unordered real value identified by value IDs. For example: Topological atom pairs,
#     Topological atom torsions and so on
#   . Option to calculate similarity value using continuous values or binary values
#
# Type III: AlphaNumericalValues
#
#   . Size of two vectors might not be same
#   . Vectors contain unordered alphanumerical values. For example: Extended connectivity fingerprints,
#     atom neighborhood fingerprints.
#   . The vector values are treated as keys or bit indices and similarity value is calculated accordingly.
#
# Before performing similarity or distance calculations between vectors containing UnorderedNumericalValues
# or AlphaNumericalValues, the vectors are transformed into vectors containing unique OrderedNumericalValues
# using value IDs for UnorderedNumericalValues and the values themselves for AlphaNumericalValues.
#
# Three forms of similarity or distance calculation between two vectors are supported: AlgebraicForm,
# BinaryForm or SetTheoreticForm.
#
# The value of an extra parameter, CalculationMode, passed to each similarity or distance function
# controls the calculation. Supported values for CalculationMode: AlgebraicForm, BinaryForm and
# SetTheoreticForm. Default: AlgebraicForm.
#
# For BinaryForm CalculationMode, the ordered list of processed final vector values containing the value or
# count of each unique value type is simply converted into a binary vector containing 1s and 0s
# corresponding to presence or absence of values before calculating similarity or distance between
# two vectors.
#
# For two fingerprint vectors A and B of same size containing OrderedNumericalValues, let:
#
#  N = Number values in A or B
#
#  Xa = Values of vector A
#  Xb = Values of vector B
#
#  Xai = Value of ith element in A
#  Xbi = Value of ith element in B
#
#  SUM = Sum of i over N values
#
# For SetTheoreticForm of calculation between two vectors, let:
#
#  SetIntersectionXaXb = SUM ( MIN ( Xai, Xbi ) )
#  SetDifferenceXaXb = SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) )
#
# For BinaryForm of calculation between two vectors, let:
#
#  Na = Number of bits set to "1" in A = SUM ( Xai )
#  Nb = Number of bits set to "1" in B = SUM ( Xbi )
#  Nc = Number of bits set to "1" in both A and B = SUM ( Xai * Xbi )
#  Nd = Number of bits set to "0" in both A and B = SUM ( 1 - Xai - Xbi + Xai * Xbi)
#
#  N = Number of bits set to "1" or "0" in A or B = Size of A or B = Na + Nb - Nc + Nd
#
# Additionally, for BinaryForm various values also correspond to:
#
#  Na = | Xa |
#  Nb = | Xb |
#  Nc = | SetIntersectionXaXb |
#  Nd = N - | SetDifferenceXaXb |
#
#  | SetDifferenceXaXb | = N - Nd = Na + Nb - Nc + Nd - Nd = Na + Nb - Nc
#                        = | Xa | + | Xb | - | SetIntersectionXaXb |
#
# Various distance coefficients and similarity coefficients [ Ref 40, Ref 62, Ref 64 ] for a pair of vectors A and B
# in AlgebraicForm, BinaryForm and SetTheoreticForm are defined as follows:
#
# . CityBlockDistanceCoefficient: ( same as HammingDistanceCoefficient and ManhattanDistanceCoefficient)
#
#   . AlgebraicForm: SUM ( ABS ( Xai - Xbi ) )
#
#   . BinaryForm: ( Na - Nc ) + ( Nb - Nc ) = Na + Nb - 2 * Nc
#
#   . SetTheoreticForm: | SetDifferenceXaXb | - | SetIntersectionXaXb |
#                       = SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) )
#
# . CosineSimilarityCoefficient: ( same as OchiaiSimilarityCoefficient)
#
#   . AlgebraicForm: SUM ( Xai * Xbi ) / SQRT ( SUM ( Xai ** 2) * SUM ( Xbi ** 2) )
#
#   . BinaryForm: Nc / SQRT ( Na * Nb)
#
#   . SetTheoreticForm: | SetIntersectionXaXb | / SQRT ( |Xa| * |Xb| )
#                       = SUM ( MIN ( Xai, Xbi ) ) / SQRT ( SUM ( Xai ) * SUM ( Xbi ) )
#
# . CzekanowskiSimilarityCoefficient: ( same as DiceSimilarityCoefficient and SorensonSimilarityCoefficient)
#
#   . AlgebraicForm: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) + SUM ( Xbi **2 ) )
#
#   . BinaryForm: 2 * Nc / ( Na + Nb )
#
#   . SetTheoreticForm: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| )
#                       = 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) )
#
# . DiceSimilarityCoefficient: ( same as CzekanowskiSimilarityCoefficient and SorensonSimilarityCoefficient)
#
#   . AlgebraicForm: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) + SUM ( Xbi **2 ) )
#
#   . BinaryForm: 2 * Nc / ( Na + Nb )
#
#   . SetTheoreticForm: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| )
#                       = 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) )
#
# . EuclideanDistanceCoefficient:
#
#   . AlgebraicForm: SQRT ( SUM ( ( ( Xai - Xbi ) ** 2 ) ) )
#
#   . BinaryForm: SQRT ( ( Na - Nc ) + ( Nb - Nc ) ) = SQRT ( Na + Nb - 2 * Nc )
#
#   . SetTheoreticForm: SQRT ( | SetDifferenceXaXb | - | SetIntersectionXaXb | )
#                       = SQRT ( SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) )
#
# . HammingDistanceCoefficient: ( same as CityBlockDistanceCoefficient and ManhattanDistanceCoefficient)
#
#   . AlgebraicForm: SUM ( ABS ( Xai - Xbi ) )
#
#   . BinaryForm: ( Na - Nc ) + ( Nb - Nc ) = Na + Nb - 2 * Nc
#
#   . SetTheoreticForm: | SetDifferenceXaXb | - | SetIntersectionXaXb |
#                       = SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) )
#
# . JaccardSimilarityCoefficient: ( same as TanimotoSimilarityCoefficient)
#
#   . AlgebraicForm: SUM ( Xai * Xbi ) / ( SUM ( Xai ** 2 ) + SUM ( Xbi ** 2 ) - SUM ( Xai * Xbi ) )
#
#   . BinaryForm: Nc / ( ( Na - Nc ) + ( Nb - Nc ) + Nc ) = Nc / ( Na + Nb - Nc )
#
#   . SetTheoreticForm: | SetIntersectionXaXb | / | SetDifferenceXaXb |
#                       = SUM ( MIN ( Xai, Xbi ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) )
#
# . ManhattanDistanceCoefficient: ( same as CityBlockDistanceCoefficient and HammingDistanceCoefficient)
#
#   . AlgebraicForm: SUM ( ABS ( Xai - Xbi ) )
#
#   . BinaryForm: ( Na - Nc ) + ( Nb - Nc ) = Na + Nb - 2 * Nc
#
#   . SetTheoreticForm: | SetDifferenceXaXb | - | SetIntersectionXaXb |
#                       = SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) )
#
# . OchiaiSimilarityCoefficient: ( same as CosineSimilarityCoefficient)
#
#   . AlgebraicForm: SUM ( Xai * Xbi ) / SQRT ( SUM ( Xai ** 2) * SUM ( Xbi ** 2) )
#
#   . BinaryForm: Nc / SQRT ( Na * Nb)
#
#   . SetTheoreticForm: | SetIntersectionXaXb | / SQRT ( |Xa| * |Xb| )
#                       = SUM ( MIN ( Xai, Xbi ) ) / SQRT ( SUM ( Xai ) * SUM ( Xbi ) )
#
# . SorensonSimilarityCoefficient: ( same as CzekanowskiSimilarityCoefficient and DiceSimilarityCoefficient)
#
#   . AlgebraicForm: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) + SUM ( Xbi **2 ) )
#
#   . BinaryForm: 2 * Nc / ( Na + Nb )
#
#   . SetTheoreticForm: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| )
#                       = 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) )
#
# . SoergelDistanceCoefficient:
#
#   . AlgebraicForm: SUM ( ABS ( Xai - Xbi ) ) / SUM ( MAX ( Xai, Xbi ) )
#
#   . BinaryForm: 1 - Nc / ( Na + Nb - Nc ) = ( Na + Nb - 2 * Nc ) / ( Na + Nb - Nc )
#
#   . SetTheoreticForm: ( | SetDifferenceXaXb | - | SetIntersectionXaXb | ) / | SetDifferenceXaXb |
#                       = ( SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) )
#
# . TanimotoSimilarityCoefficient: ( same as JaccardSimilarityCoefficient)
#
#   . AlgebraicForm: SUM ( Xai * Xbi ) / ( SUM ( Xai ** 2 ) + SUM ( Xbi ** 2 ) - SUM ( Xai * Xbi ) )
#
#   . BinaryForm: Nc / ( ( Na - Nc ) + ( Nb - Nc ) + Nc ) = Nc / ( Na + Nb - Nc )
#
#   . SetTheoreticForm: | SetIntersectionXaXb | / | SetDifferenceXaXb |
#                       = SUM ( MIN ( Xai, Xbi ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) )
#
#

# Calculate Hamming distance coefficient between two fingerprint vectors. It's the same
# as CityBlock distance coefficient.
#
# This functionality can be either invoked as a class function or an object method.
#
sub HammingDistanceCoefficient ($$;$$) {
  my($VectorA, $VectorB, $Mode, $SkipCheck) = @_;

  return CityBlockDistanceCoefficient($VectorA, $VectorB, $Mode, $SkipCheck);
}

# Calculate Manhattan distance coefficient between two fingerprint vectors. It's the same
# as CityBlock distance coefficient.
#
# This functionality can be either invoked as a class function or an object method.
#
sub ManhattanDistanceCoefficient ($$;$$) {
  my($VectorA, $VectorB, $Mode, $SkipCheck) = @_;

  return CityBlockDistanceCoefficient($VectorA, $VectorB, $Mode, $SkipCheck);
}

# Calculate CityBlock distance coefficient between two fingerprint vectors.
#
# This functionality can be either invoked as a class function or an object method.
#
# CalculationMode defaults to AlgebraicForm and SkipValuesCheck defaults to 0. An undefined
# value is returned for any calculation mode other than AlgebraicForm, BinaryForm or
# SetTheoreticForm.
#
sub CityBlockDistanceCoefficient ($$;$$) {
  my($FingerprintsVectorA, $FingerprintsVectorB, $CalculationMode, $SkipValuesCheck) = @_;

  $CalculationMode = 'AlgebraicForm' unless (defined $CalculationMode);
  $SkipValuesCheck = 0 unless (defined $SkipValuesCheck);

  # Validate and process fingerprints vectors for similarity calculations...
  #
  _ValidateAndProcessFingerprintsVectorsForSimilarityCalculation("CityBlockDistanceCoefficient: Calculation failed", $FingerprintsVectorA, $FingerprintsVectorB, $CalculationMode, $SkipValuesCheck);

  # Perform the calculation...
  if ($CalculationMode =~ /^AlgebraicForm$/i) {
    return _CityBlockDistanceCoefficientUsingAlgebraicForm($FingerprintsVectorA, $FingerprintsVectorB);
  }
  if ($CalculationMode =~ /^BinaryForm$/i) {
    return _CityBlockDistanceCoefficientUsingBinaryForm($FingerprintsVectorA, $FingerprintsVectorB);
  }
  if ($CalculationMode =~ /^SetTheoreticForm$/i) {
    return _CityBlockDistanceCoefficientUsingSetTheoreticForm($FingerprintsVectorA, $FingerprintsVectorB);
  }

  return undef;
}

# Calculate CityBlock distance coefficient using algebraic form: SUM ( ABS ( Xai - Xbi ) )
#
sub _CityBlockDistanceCoefficientUsingAlgebraicForm {
  my($VectorA, $VectorB) = @_;

  my $SumOfAbsoluteDifferences = _GetSumOfAbsoluteValueOfSubtractionOfFingerprintsOrderedValues($VectorA, $VectorB);

  return $SumOfAbsoluteDifferences;
}

# Calculate CityBlock distance coefficient using binary form: Na + Nb - 2 * Nc
#
sub _CityBlockDistanceCoefficientUsingBinaryForm {
  my($VectorA, $VectorB) = @_;

  my($Na, $Nb, $Nc) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);

  return ($Na + $Nb - 2 * $Nc);
}

# Calculate CityBlock distance coefficient using set theoretic form...
#
# Returns SUM ( Xai ) + SUM ( Xbi ) - 2 * SUM ( MIN ( Xai, Xbi ) ) over ordered
# values of the two vectors.
#
sub _CityBlockDistanceCoefficientUsingSetTheoreticForm {
  my($VectorA, $VectorB) = @_;

  my $TotalA = _GetSumOfFingerprintsOrderedValues($VectorA);
  my $TotalB = _GetSumOfFingerprintsOrderedValues($VectorB);
  my $TotalOfMinimums = _GetSumOfMinimumOfFingerprintsOrderdedValues($VectorA, $VectorB);

  return $TotalA + $TotalB - 2 * $TotalOfMinimums;
}

# Calculate Ochiai similarity coefficient between two fingerprint vectors.
#
# Ochiai similarity is an alias for Cosine similarity: the two fingerprint vectors,
# the optional calculation mode and the optional skip values check flag are handed
# to CosineSimilarityCoefficient as is and its result is returned.
#
# This functionality can be either invoked as a class function or an object method.
#
sub OchiaiSimilarityCoefficient ($$;$$) {
  my($VectorA, $VectorB, $Mode, $SkipCheck) = @_;

  return CosineSimilarityCoefficient($VectorA, $VectorB, $Mode, $SkipCheck);
}

# Calculate Cosine similarity coefficient between two fingerprint vectors.
#
# Arguments:
#    . First and second fingerprint vectors
#    . Calculation mode [ Optional; default: AlgebraicForm ]: AlgebraicForm, BinaryForm
#      or SetTheoreticForm, matched without regard to case
#    . Skip values check [ Optional; default: 0 ]
#
# Returns the value calculated by the helper for the selected mode, or undef for a
# mode matching none of the three forms.
#
# This functionality can be either invoked as a class function or an object method.
#
sub CosineSimilarityCoefficient ($$;$$) {
  my($VectorA, $VectorB, $Mode, $SkipCheck) = @_;

  # Fill in defaults for optional arguments...
  $Mode = 'AlgebraicForm' unless defined $Mode;
  $SkipCheck = 0 unless defined $SkipCheck;

  # Validate and set up ordered values, along with bit vectors for BinaryForm...
  _ValidateAndProcessFingerprintsVectorsForSimilarityCalculation("CosineSimilarityCoefficient: Calculation failed", $VectorA, $VectorB, $Mode, $SkipCheck);

  # Hand off to the calculation for the specified mode...
  return _CosineSimilarityCoefficientUsingAlgebraicForm($VectorA, $VectorB) if $Mode =~ /^AlgebraicForm$/i;
  return _CosineSimilarityCoefficientUsingBinaryForm($VectorA, $VectorB) if $Mode =~ /^BinaryForm$/i;
  return _CosineSimilarityCoefficientUsingSetTheoreticForm($VectorA, $VectorB) if $Mode =~ /^SetTheoreticForm$/i;

  return undef;
}

# Calculate Cosine similarity coefficient using algebraic form...
#
# Returns SUM ( Xai * Xbi ) / SQRT ( SUM ( Xai ** 2 ) * SUM ( Xbi ** 2 ) ), or 0 when
# the denominator is zero.
#
sub _CosineSimilarityCoefficientUsingAlgebraicForm {
  my($VectorA, $VectorB) = @_;

  my $SumOfSquaresA = _GetSumOfSquaresOfFingerprintsOrderedValues($VectorA);
  my $SumOfSquaresB = _GetSumOfSquaresOfFingerprintsOrderedValues($VectorB);
  my $SumOfProducts = _GetSumOfProductOfFingerprintsOrderedValues($VectorA, $VectorB);

  my $Divisor = sqrt($SumOfSquaresA * $SumOfSquaresB);

  # Avoid division by zero...
  return 0 unless $Divisor;

  return $SumOfProducts / $Divisor;
}

# Calculate Cosine similarity coefficient using binary form...
#
# Returns Nc / SQRT ( Na * Nb ), or 0 when the denominator is zero.
#
sub _CosineSimilarityCoefficientUsingBinaryForm {
  my($VectorA, $VectorB) = @_;

  my($NumSetBitsA, $NumSetBitsB, $NumCommonSetBits) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);

  my $Divisor = sqrt($NumSetBitsA * $NumSetBitsB);

  # Avoid division by zero...
  return 0 unless $Divisor;

  return $NumCommonSetBits / $Divisor;
}

# Calculate Cosine similarity coefficient using set theoretic form...
#
# Returns SUM ( MIN ( Xai, Xbi ) ) / SQRT ( SUM ( Xai ) * SUM ( Xbi ) ), or 0 when
# the denominator is zero.
#
# NOTE(review): sqrt is applied to the product of the two sums as is; presumably
# ordered values are never negative in this form - confirm against callers.
#
sub _CosineSimilarityCoefficientUsingSetTheoreticForm {
  my($VectorA, $VectorB) = @_;

  my $TotalA = _GetSumOfFingerprintsOrderedValues($VectorA);
  my $TotalB = _GetSumOfFingerprintsOrderedValues($VectorB);
  my $TotalOfMinimums = _GetSumOfMinimumOfFingerprintsOrderdedValues($VectorA, $VectorB);

  my $Divisor = sqrt($TotalA * $TotalB);

  # Avoid division by zero...
  return 0 unless $Divisor;

  return $TotalOfMinimums / $Divisor;
}

# Calculate Czekanowski similarity coefficient between two fingerprint vectors.
#
# Czekanowski similarity is an alias for Dice similarity: the two fingerprint vectors,
# the optional calculation mode and the optional skip values check flag are handed
# to DiceSimilarityCoefficient as is and its result is returned.
#
# This functionality can be either invoked as a class function or an object method.
#
sub CzekanowskiSimilarityCoefficient ($$;$$) {
  my($VectorA, $VectorB, $Mode, $SkipCheck) = @_;

  return DiceSimilarityCoefficient($VectorA, $VectorB, $Mode, $SkipCheck);
}

# Calculate Sorenson similarity coefficient between two fingerprint vectors.
#
# Sorenson similarity is an alias for Dice similarity: the two fingerprint vectors,
# the optional calculation mode and the optional skip values check flag are handed
# to DiceSimilarityCoefficient as is and its result is returned.
#
# This functionality can be either invoked as a class function or an object method.
#
sub SorensonSimilarityCoefficient ($$;$$) {
  my($VectorA, $VectorB, $Mode, $SkipCheck) = @_;

  return DiceSimilarityCoefficient($VectorA, $VectorB, $Mode, $SkipCheck);
}

# Calculate Dice similarity coefficient between two fingerprint vectors.
#
# Arguments:
#    . First and second fingerprint vectors
#    . Calculation mode [ Optional; default: AlgebraicForm ]: AlgebraicForm, BinaryForm
#      or SetTheoreticForm, matched without regard to case
#    . Skip values check [ Optional; default: 0 ]
#
# Returns the value calculated by the helper for the selected mode, or undef for a
# mode matching none of the three forms.
#
# This functionality can be either invoked as a class function or an object method.
#
sub DiceSimilarityCoefficient ($$;$$) {
  my($VectorA, $VectorB, $Mode, $SkipCheck) = @_;

  # Fill in defaults for optional arguments...
  $Mode = 'AlgebraicForm' unless defined $Mode;
  $SkipCheck = 0 unless defined $SkipCheck;

  # Validate and set up ordered values, along with bit vectors for BinaryForm...
  _ValidateAndProcessFingerprintsVectorsForSimilarityCalculation("DiceSimilarityCoefficient: Calculation failed", $VectorA, $VectorB, $Mode, $SkipCheck);

  # Hand off to the calculation for the specified mode...
  return _DiceSimilarityCoefficientUsingAlgebraicForm($VectorA, $VectorB) if $Mode =~ /^AlgebraicForm$/i;
  return _DiceSimilarityCoefficientUsingBinaryForm($VectorA, $VectorB) if $Mode =~ /^BinaryForm$/i;
  return _DiceSimilarityCoefficientUsingSetTheoreticForm($VectorA, $VectorB) if $Mode =~ /^SetTheoreticForm$/i;

  return undef;
}

# Calculate Dice similarity coefficient using algebraic form...
#
# Returns ( 2 * SUM ( Xai * Xbi ) ) / ( SUM ( Xai ** 2 ) + SUM ( Xbi ** 2 ) ), or 0
# when the denominator is zero.
#
sub _DiceSimilarityCoefficientUsingAlgebraicForm {
  my($VectorA, $VectorB) = @_;

  my $SumOfSquaresA = _GetSumOfSquaresOfFingerprintsOrderedValues($VectorA);
  my $SumOfSquaresB = _GetSumOfSquaresOfFingerprintsOrderedValues($VectorB);
  my $SumOfProducts = _GetSumOfProductOfFingerprintsOrderedValues($VectorA, $VectorB);

  my $Divisor = $SumOfSquaresA + $SumOfSquaresB;

  # Avoid division by zero...
  return 0 unless $Divisor;

  return (2 * $SumOfProducts) / $Divisor;
}

# Calculate Dice similarity coefficient using binary form...
#
# Returns 2 * Nc / ( Na + Nb ), or 0 when the denominator is zero.
#
sub _DiceSimilarityCoefficientUsingBinaryForm {
  my($VectorA, $VectorB) = @_;

  my($NumSetBitsA, $NumSetBitsB, $NumCommonSetBits) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);

  my $Divisor = $NumSetBitsA + $NumSetBitsB;

  # Avoid division by zero...
  return 0 unless $Divisor;

  return (2 * $NumCommonSetBits) / $Divisor;
}

# Calculate Dice similarity coefficient using set theoretic form...
#
# Returns 2 * SUM ( MIN ( Xai, Xbi ) ) / ( SUM ( Xai ) + SUM ( Xbi ) ), or 0 when the
# denominator is zero.
#
sub _DiceSimilarityCoefficientUsingSetTheoreticForm {
  my($VectorA, $VectorB) = @_;

  my $TotalA = _GetSumOfFingerprintsOrderedValues($VectorA);
  my $TotalB = _GetSumOfFingerprintsOrderedValues($VectorB);
  my $TotalOfMinimums = _GetSumOfMinimumOfFingerprintsOrderdedValues($VectorA, $VectorB);

  my $Divisor = $TotalA + $TotalB;

  # Avoid division by zero...
  return 0 unless $Divisor;

  return (2 * $TotalOfMinimums) / $Divisor;
}


# Calculate Euclidean distance coefficient between two fingerprint vectors.
#
# Arguments:
#    . First and second fingerprint vectors
#    . Calculation mode [ Optional; default: AlgebraicForm ]: AlgebraicForm, BinaryForm
#      or SetTheoreticForm, matched without regard to case
#    . Skip values check [ Optional; default: 0 ]
#
# Returns the value calculated by the helper for the selected mode, or undef for a
# mode matching none of the three forms.
#
# This functionality can be either invoked as a class function or an object method.
#
sub EuclideanDistanceCoefficient ($$;$$) {
  my($VectorA, $VectorB, $Mode, $SkipCheck) = @_;

  # Fill in defaults for optional arguments...
  $Mode = 'AlgebraicForm' unless defined $Mode;
  $SkipCheck = 0 unless defined $SkipCheck;

  # Validate and set up ordered values, along with bit vectors for BinaryForm...
  _ValidateAndProcessFingerprintsVectorsForSimilarityCalculation("EuclideanDistanceCoefficient: Calculation failed", $VectorA, $VectorB, $Mode, $SkipCheck);

  # Hand off to the calculation for the specified mode...
  return _EuclideanDistanceCoefficientUsingAlgebraicForm($VectorA, $VectorB) if $Mode =~ /^AlgebraicForm$/i;
  return _EuclideanDistanceCoefficientUsingBinaryForm($VectorA, $VectorB) if $Mode =~ /^BinaryForm$/i;
  return _EuclideanDistanceCoefficientUsingSetTheoreticForm($VectorA, $VectorB) if $Mode =~ /^SetTheoreticForm$/i;

  return undef;
}

# Calculate Euclidean distance coefficient using algebraic form...
#
# Returns SQRT ( SUM ( ( Xai - Xbi ) ** 2 ) ) over ordered values of the two vectors.
#
sub _EuclideanDistanceCoefficientUsingAlgebraicForm {
  my($VectorA, $VectorB) = @_;

  return sqrt(_GetSumOfSquaresOfSubtractionOfFingerprintsOrderedValues($VectorA, $VectorB));
}

# Calculate Euclidean distance coefficient using binary form...
#
# Returns SQRT ( Na + Nb - 2 * Nc ).
#
sub _EuclideanDistanceCoefficientUsingBinaryForm {
  my($VectorA, $VectorB) = @_;

  my($NumSetBitsA, $NumSetBitsB, $NumCommonSetBits) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);

  return sqrt($NumSetBitsA + $NumSetBitsB - 2 * $NumCommonSetBits);
}

# Calculate Euclidean distance coefficient using set theoretic form...
#
# Returns SQRT ( SUM ( Xai ) + SUM ( Xbi ) - 2 * SUM ( MIN ( Xai, Xbi ) ) ) over
# ordered values of the two vectors.
#
sub _EuclideanDistanceCoefficientUsingSetTheoreticForm {
  my($VectorA, $VectorB) = @_;

  my $TotalA = _GetSumOfFingerprintsOrderedValues($VectorA);
  my $TotalB = _GetSumOfFingerprintsOrderedValues($VectorB);
  my $TotalOfMinimums = _GetSumOfMinimumOfFingerprintsOrderdedValues($VectorA, $VectorB);

  return sqrt($TotalA + $TotalB - 2 * $TotalOfMinimums);
}

# Calculate Jaccard similarity coefficient between two fingerprint vectors.
#
# Jaccard similarity is an alias for Tanimoto similarity: the two fingerprint vectors,
# the optional calculation mode and the optional skip values check flag are handed
# to TanimotoSimilarityCoefficient as is and its result is returned.
#
# This functionality can be either invoked as a class function or an object method.
#
sub JaccardSimilarityCoefficient ($$;$$) {
  my($VectorA, $VectorB, $Mode, $SkipCheck) = @_;

  return TanimotoSimilarityCoefficient($VectorA, $VectorB, $Mode, $SkipCheck);
}

# Calculate Tanimoto similarity coefficient between two fingerprint vectors.
#
# Arguments:
#    . First and second fingerprint vectors
#    . Calculation mode [ Optional; default: AlgebraicForm ]: AlgebraicForm, BinaryForm
#      or SetTheoreticForm, matched without regard to case
#    . Skip values check [ Optional; default: 0 ]
#
# Returns the value calculated by the helper for the selected mode, or undef for a
# mode matching none of the three forms.
#
# This functionality can be either invoked as a class function or an object method.
#
sub TanimotoSimilarityCoefficient ($$;$$) {
  my($VectorA, $VectorB, $Mode, $SkipCheck) = @_;

  # Fill in defaults for optional arguments...
  $Mode = 'AlgebraicForm' unless defined $Mode;
  $SkipCheck = 0 unless defined $SkipCheck;

  # Validate and set up ordered values, along with bit vectors for BinaryForm...
  _ValidateAndProcessFingerprintsVectorsForSimilarityCalculation("TanimotoSimilarityCoefficient: Calculation failed", $VectorA, $VectorB, $Mode, $SkipCheck);

  # Hand off to the calculation for the specified mode...
  return _TanimotoSimilarityCoefficientUsingAlgebraicForm($VectorA, $VectorB) if $Mode =~ /^AlgebraicForm$/i;
  return _TanimotoSimilarityCoefficientUsingBinaryForm($VectorA, $VectorB) if $Mode =~ /^BinaryForm$/i;
  return _TanimotoSimilarityCoefficientUsingSetTheoreticForm($VectorA, $VectorB) if $Mode =~ /^SetTheoreticForm$/i;

  return undef;
}

# Calculate Tanimoto similarity coefficient using algebraic form...
#
# Returns SUM ( Xai * Xbi ) / ( SUM ( Xai ** 2 ) + SUM ( Xbi ** 2 ) - SUM ( Xai * Xbi ) ),
# or 0 when the denominator is zero.
#
sub _TanimotoSimilarityCoefficientUsingAlgebraicForm {
  my($VectorA, $VectorB) = @_;

  my $SumOfSquaresA = _GetSumOfSquaresOfFingerprintsOrderedValues($VectorA);
  my $SumOfSquaresB = _GetSumOfSquaresOfFingerprintsOrderedValues($VectorB);
  my $SumOfProducts = _GetSumOfProductOfFingerprintsOrderedValues($VectorA, $VectorB);

  my $Divisor = $SumOfSquaresA + $SumOfSquaresB - $SumOfProducts;

  # Avoid division by zero...
  return 0 unless $Divisor;

  return $SumOfProducts / $Divisor;
}

# Calculate Tanimoto similarity coefficient using binary form...
#
# Returns Nc / ( Na + Nb - Nc ), or 0 when the denominator is zero.
#
sub _TanimotoSimilarityCoefficientUsingBinaryForm {
  my($VectorA, $VectorB) = @_;

  my($NumSetBitsA, $NumSetBitsB, $NumCommonSetBits) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);

  my $Divisor = $NumSetBitsA + $NumSetBitsB - $NumCommonSetBits;

  # Avoid division by zero...
  return 0 unless $Divisor;

  return $NumCommonSetBits / $Divisor;
}

# Calculate Tanimoto similarity coefficient using set theoretic form...
#
# Returns SUM ( MIN ( Xai, Xbi ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) ),
# or 0 when the denominator is zero.
#
sub _TanimotoSimilarityCoefficientUsingSetTheoreticForm {
  my($VectorA, $VectorB) = @_;

  my $TotalA = _GetSumOfFingerprintsOrderedValues($VectorA);
  my $TotalB = _GetSumOfFingerprintsOrderedValues($VectorB);
  my $TotalOfMinimums = _GetSumOfMinimumOfFingerprintsOrderdedValues($VectorA, $VectorB);

  my $Divisor = $TotalA + $TotalB - $TotalOfMinimums;

  # Avoid division by zero...
  return 0 unless $Divisor;

  return $TotalOfMinimums / $Divisor;
}


# Calculate Soergel distance coefficient between two fingerprint vectors.
#
# Arguments:
#    . First and second fingerprint vectors
#    . Calculation mode [ Optional; default: AlgebraicForm ]: AlgebraicForm, BinaryForm
#      or SetTheoreticForm, matched without regard to case
#    . Skip values check [ Optional; default: 0 ]
#
# Returns the value calculated by the helper for the selected mode, or undef for a
# mode matching none of the three forms.
#
# This functionality can be either invoked as a class function or an object method.
#
sub SoergelDistanceCoefficient ($$;$$) {
  my($VectorA, $VectorB, $Mode, $SkipCheck) = @_;

  # Fill in defaults for optional arguments...
  $Mode = 'AlgebraicForm' unless defined $Mode;
  $SkipCheck = 0 unless defined $SkipCheck;

  # Validate and set up ordered values, along with bit vectors for BinaryForm...
  _ValidateAndProcessFingerprintsVectorsForSimilarityCalculation("SoergelDistanceCoefficient: Calculation failed", $VectorA, $VectorB, $Mode, $SkipCheck);

  # Hand off to the calculation for the specified mode...
  return _SoergelDistanceCoefficientUsingAlgebraicForm($VectorA, $VectorB) if $Mode =~ /^AlgebraicForm$/i;
  return _SoergelDistanceCoefficientUsingBinaryForm($VectorA, $VectorB) if $Mode =~ /^BinaryForm$/i;
  return _SoergelDistanceCoefficientUsingSetTheoreticForm($VectorA, $VectorB) if $Mode =~ /^SetTheoreticForm$/i;

  return undef;
}

# Calculate Soergel distance coefficient using algebraic form...
#
# Returns SUM ( ABS ( Xai - Xbi ) ) / SUM ( MAX ( Xai, Xbi ) ), or 0 when the
# denominator is zero.
#
sub _SoergelDistanceCoefficientUsingAlgebraicForm {
  my($VectorA, $VectorB) = @_;

  my $TotalOfAbsoluteDifferences = _GetSumOfAbsoluteValueOfSubtractionOfFingerprintsOrderedValues($VectorA, $VectorB);
  my $TotalOfMaximums = _GetSumOfMaximumOfFingerprintsOrderdedValues($VectorA, $VectorB);

  # Avoid division by zero...
  return 0 unless $TotalOfMaximums;

  return $TotalOfAbsoluteDifferences / $TotalOfMaximums;
}

# Calculate Soergel distance coefficient using binary form...
#
# Returns ( Na + Nb - 2 * Nc ) / ( Na + Nb - Nc ), or 0 when the denominator is zero.
#
sub _SoergelDistanceCoefficientUsingBinaryForm {
  my($VectorA, $VectorB) = @_;

  my($NumSetBitsA, $NumSetBitsB, $NumCommonSetBits) = _GetNumOfIndividualAndCommonSetBits($VectorA, $VectorB);

  my $Dividend = $NumSetBitsA + $NumSetBitsB - 2 * $NumCommonSetBits;
  my $Divisor = $NumSetBitsA + $NumSetBitsB - $NumCommonSetBits;

  # Avoid division by zero...
  return 0 unless $Divisor;

  return $Dividend / $Divisor;
}

# Calculate SoergelDistanceCoefficient using set theoretic form...
#
# Returns ( SUM ( Xai ) + SUM ( Xbi ) - 2 * SUM ( MIN ( Xai, Xbi ) ) ) /
# ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) ), or 0 when the denominator
# is zero.
#
sub _SoergelDistanceCoefficientUsingSetTheoreticForm {
  my($FingerprintsVectorA, $FingerprintsVectorB) = @_;
  my($SumMinXaiXbi, $SumXai, $SumXbi, $Numerator, $Denominator);

  $SumXai = _GetSumOfFingerprintsOrderedValues($FingerprintsVectorA);
  $SumXbi = _GetSumOfFingerprintsOrderedValues($FingerprintsVectorB);
  $SumMinXaiXbi = _GetSumOfMinimumOfFingerprintsOrderdedValues($FingerprintsVectorA, $FingerprintsVectorB);

  $Numerator = $SumXai + $SumXbi - 2 * $SumMinXaiXbi;
  $Denominator = $SumXai + $SumXbi - $SumMinXaiXbi;

  # Avoid division by zero...
  return $Denominator ? ($Numerator/$Denominator) : 0;
}

# Validate and process fingerprints vectors for similarity calculations...
#
# Arguments:
#    . Error message prefix used in any croak message
#    . First and second fingerprint vectors
#    . Calculation mode [ Optional; default: AlgebraicForm ]
#    . Skip values check [ Optional; default: 0 ]
#
# Validation is performed unless skip values check is true; processing, which sets
# up OrderedValuesRef and BitVector on both vectors, is always performed.
#
sub _ValidateAndProcessFingerprintsVectorsForSimilarityCalculation {
  my($ErrorMsg, $FingerprintsVectorA, $FingerprintsVectorB, $CalculationMode, $SkipValuesCheck) = @_;

  $CalculationMode = defined $CalculationMode ? $CalculationMode : 'AlgebraicForm';
  $SkipValuesCheck = defined $SkipValuesCheck ? $SkipValuesCheck : 0;

  if (!$SkipValuesCheck) {
    _ValidateFingerprintsVectorsForSimilarityCalculation($ErrorMsg, $FingerprintsVectorA, $FingerprintsVectorB, $CalculationMode);
  }
  _ProcessFingerprintsVectorsForSimilarityCalculation($ErrorMsg, $FingerprintsVectorA, $FingerprintsVectorB, $CalculationMode);
}

# Make sure fingerprint vectors are good for performing similarity/distance calculation...
#
# Arguments: error message prefix, first and second fingerprint vectors, and
# calculation mode.
#
# Croaks, with a message starting with the class name and the error message prefix,
# when:
#    . Either object is not a fingerprints vector
#    . Types of the two vectors are different
#    . Calculation mode is not AlgebraicForm, BinaryForm or SetTheoreticForm
#    . Either vector has no values
#    . OrderedNumericalValues: numbers of values in the two vectors are different
#    . NumericalValues: a vector has no value IDs, or its number of value IDs is
#      different from its number of values
#    . AlphaNumericalValues: either vector has value IDs
#    . Type is none of the three types listed above
#
sub _ValidateFingerprintsVectorsForSimilarityCalculation {
  my($ErrorMsg, $FingerprintsVectorA, $FingerprintsVectorB, $CalculationMode) = @_;

  # Make sure both are fingerprint vectors..
  if (!(IsFingerprintsVector($FingerprintsVectorA) && IsFingerprintsVector($FingerprintsVectorB))) {
    croak "Error: ${ClassName}->${ErrorMsg}: Both objects must be fingerprint vectors...";
  }

  # Check types...
  if ($FingerprintsVectorA->{Type} ne $FingerprintsVectorB->{Type}) {
    croak "Error: ${ClassName}->${ErrorMsg}: Type of first fingerprint vector, $FingerprintsVectorA->{Type}, must be same as type of second fingerprint vector, $FingerprintsVectorB->{Type}...";
  }

  # Check calculation mode...
  if ($CalculationMode !~ /^(AlgebraicForm|BinaryForm|SetTheoreticForm)$/i) {
    croak "Error: ${ClassName}->${ErrorMsg}: Specified similarity calculation mode, $CalculationMode, is not valid. Supported values: AlgebraicForm, BinaryForm, and SetTheoreticForm...";
  }

  # Check values and value IDs...
  my($Na, $Nb, $NIDa, $NIDb);
  $Na = $FingerprintsVectorA->GetNumOfValues(); $Nb = $FingerprintsVectorB->GetNumOfValues();
  $NIDa = $FingerprintsVectorA->GetNumOfValueIDs(); $NIDb = $FingerprintsVectorB->GetNumOfValueIDs();

  if ($Na == 0) {
    croak "Error: ${ClassName}->${ErrorMsg}: Number of values in first fingerprint vector, $Na, must be > 0 for fingerprint vector type $FingerprintsVectorA->{Type} ...";
  }
  if ($Nb == 0) {
    croak "Error: ${ClassName}->${ErrorMsg}: Number of values in second fingerprint vector, $Nb, must be > 0 for fingerprint vector type $FingerprintsVectorB->{Type} ...";
  }

  # Types are known to be identical at this point; type of the first vector selects the checks...
  if ($FingerprintsVectorA->{Type} =~ /^OrderedNumericalValues$/i) {
    if ($Na != $Nb) {
      croak "Error: ${ClassName}->${ErrorMsg}: Number of values in first fingerprint vector, $Na, must be equal to number of values, $Nb, in second fingerprint vector for fingerprint vector types $FingerprintsVectorA->{Type} ...";
    }
  }
  elsif ($FingerprintsVectorA->{Type} =~ /^NumericalValues$/i) {
    if ($NIDa == 0) {
      croak "Error: ${ClassName}->${ErrorMsg}: Number of value IDs in first fingerprint vector, $NIDa, must be > 0 for fingerprint vector type $FingerprintsVectorA->{Type} ...";
    }
    if ($NIDb == 0) {
      # This check is for the second vector and the message must say so...
      croak "Error: ${ClassName}->${ErrorMsg}: Number of value IDs in second fingerprint vector, $NIDb, must be > 0 for fingerprint vector type $FingerprintsVectorB->{Type} ...";
    }

    if ($NIDa != $Na) {
      croak "Error: ${ClassName}->${ErrorMsg}: Number of value IDs in first fingerprint vector, $NIDa, must be equal to its number of values, $Na, for fingerprint vector type $FingerprintsVectorA->{Type} ...";
    }
    if ($NIDb != $Nb) {
      croak "Error: ${ClassName}->${ErrorMsg}: Number of value IDs in second fingerprint vector, $NIDb, must be equal to its number of values, $Nb, for fingerprint vector type $FingerprintsVectorB->{Type} ...";
    }
  }
  elsif ($FingerprintsVectorA->{Type} =~ /^AlphaNumericalValues$/i) {
    if ($NIDa || $NIDb) {
      croak "Error: ${ClassName}->${ErrorMsg}: ValueIDs cann't be specified for fingerprint vector types $FingerprintsVectorA->{Type} ...";
    }
  }
  else {
    croak "Error: ${ClassName}->${ErrorMsg}: Fingerprint vector types $FingerprintsVectorA->{Type} is not valid...";
  }
}

# Process fingerprints vectors for similarity calculation by generating vectors
# containing ordered list of values...
#
# Arguments: error message prefix, first and second fingerprint vectors, and
# calculation mode.
#
# OrderedValuesRef and BitVector are reset to undef on both vectors before ordered
# values are set up by the processing for the type of the first vector; bit vectors
# are generated only for BinaryForm calculation mode. Croaks for any other type.
#
sub _ProcessFingerprintsVectorsForSimilarityCalculation {
  my($ErrorMsg, $FingerprintsVectorA, $FingerprintsVectorB, $CalculationMode) = @_;

  # Drop whatever was set up by an earlier calculation...
  for my $FingerprintsVector ($FingerprintsVectorA, $FingerprintsVectorB) {
    $FingerprintsVector->{OrderedValuesRef} = undef;
    $FingerprintsVector->{BitVector} = undef;
  }

  # Set up ordered values using type of the first vector...
  my $Type = $FingerprintsVectorA->{Type};
  if ($Type =~ /^OrderedNumericalValues$/i) {
    _ProcessOrderedNumericalValuesFingerprintsVectorsForSimilarityCalculation($FingerprintsVectorA, $FingerprintsVectorB);
  }
  elsif ($Type =~ /^NumericalValues$/i) {
    _ProcessNumericalValuesFingerprintsVectorsForSimilarityCalculation($FingerprintsVectorA, $FingerprintsVectorB);
  }
  elsif ($Type =~ /^AlphaNumericalValues$/i) {
    _ProcessAlphaNumericalValuesFingerprintsVectorsForSimilarityCalculation($FingerprintsVectorA, $FingerprintsVectorB);
  }
  else {
    croak "Error: ${ClassName}->${ErrorMsg}: Fingerprint vector types $FingerprintsVectorA->{Type} is not valid...";
  }

  # Bit vectors are needed only for BinaryForm...
  _TransformFinalOrderedValuesIntoBitVectorsForSimilarityCalculation($FingerprintsVectorA, $FingerprintsVectorB) if $CalculationMode =~ /^BinaryForm$/i;
}

# Process fingerprints vectors with ordered numerical values for similarity calculations...
#
# Values are already ordered for this type: OrderedValuesRef of each vector is set to
# reference its own Values array and no data is copied.
#
sub _ProcessOrderedNumericalValuesFingerprintsVectorsForSimilarityCalculation {
  my($FingerprintsVectorA, $FingerprintsVectorB) = @_;

  for my $FingerprintsVector ($FingerprintsVectorA, $FingerprintsVectorB) {
    $FingerprintsVector->{OrderedValuesRef} = \@{$FingerprintsVector->{Values}};
  }
}

# Process fingerprints vectors with numerical values for similarity calculations...
#
# Values sharing a value ID within a vector are added together. The union of value
# IDs from both vectors is sorted as strings and OrderedValuesRef of each vector is
# set to a new array holding its value for each ID in that order; 0 is used for an
# ID missing from that vector.
#
sub _ProcessNumericalValuesFingerprintsVectorsForSimilarityCalculation {
  my($FingerprintsVectorA, $FingerprintsVectorB) = @_;

  # Set up unique IDs and values map for each fingerprint vector...
  my(%ValueIDValuesA, %ValueIDValuesB, %UniqueValueIDs);

  for my $VectorAndMap ([$FingerprintsVectorA, \%ValueIDValuesA], [$FingerprintsVectorB, \%ValueIDValuesB]) {
    my($FingerprintsVector, $ValueIDValuesRef) = @{$VectorAndMap};

    for my $Index (0 .. $#{$FingerprintsVector->{ValueIDs}}) {
      my $ValueID = $FingerprintsVector->{ValueIDs}[$Index];
      my $Value = $FingerprintsVector->{Values}[$Index];

      # First value for an ID is stored as is; later ones are added to it...
      if (exists $ValueIDValuesRef->{$ValueID}) {
        $ValueIDValuesRef->{$ValueID} += $Value;
      }
      else {
        $ValueIDValuesRef->{$ValueID} = $Value;
      }
      $UniqueValueIDs{$ValueID} = 1;
    }
  }

  # Setup ordered values...
  my @UniqueOrderedValueIDs = sort keys %UniqueValueIDs;
  my @OrderedValuesA = map { exists $ValueIDValuesA{$_} ? $ValueIDValuesA{$_} : 0 } @UniqueOrderedValueIDs;
  my @OrderedValuesB = map { exists $ValueIDValuesB{$_} ? $ValueIDValuesB{$_} : 0 } @UniqueOrderedValueIDs;

  $FingerprintsVectorA->{OrderedValuesRef} = \@OrderedValuesA;
  $FingerprintsVectorB->{OrderedValuesRef} = \@OrderedValuesB;
}

# Process fingerprints vectors with alpha numerical values for similarity calculations...
#
# Each distinct value is treated as an ID and the number of times it appears in a
# vector as its value. The union of distinct values from both vectors is sorted as
# strings and OrderedValuesRef of each vector is set to a new array holding its count
# for each value in that order; 0 is used for a value missing from that vector.
#
sub _ProcessAlphaNumericalValuesFingerprintsVectorsForSimilarityCalculation {
  my($FingerprintsVectorA, $FingerprintsVectorB) = @_;

  # Set up unique values and count map for each vector...
  my(%ValuesCountA, %ValuesCountB, %UniqueValues);

  for my $VectorAndMap ([$FingerprintsVectorA, \%ValuesCountA], [$FingerprintsVectorB, \%ValuesCountB]) {
    my($FingerprintsVector, $ValuesCountRef) = @{$VectorAndMap};

    for my $Value (@{$FingerprintsVector->{Values}}) {
      $ValuesCountRef->{$Value}++;
      $UniqueValues{$Value} = 1;
    }
  }

  # Setup ordered values...
  my @UniqueOrderedValueIDs = sort keys %UniqueValues;
  my @OrderedValuesA = map { exists $ValuesCountA{$_} ? $ValuesCountA{$_} : 0 } @UniqueOrderedValueIDs;
  my @OrderedValuesB = map { exists $ValuesCountB{$_} ? $ValuesCountB{$_} : 0 } @UniqueOrderedValueIDs;

  $FingerprintsVectorA->{OrderedValuesRef} = \@OrderedValuesA;
  $FingerprintsVectorB->{OrderedValuesRef} = \@OrderedValuesB;
}

# Transform final ordered values array into a BitVector for similarity calculation...
#
# Both bit vectors are created with size equal to the number of ordered values in the
# first vector; a bit is set for each position where the ordered value is true.
#
sub _TransformFinalOrderedValuesIntoBitVectorsForSimilarityCalculation {
  my($FingerprintsVectorA, $FingerprintsVectorB) = @_;

  # Create bit vectors. Direct method call syntax is used as this package defines
  # its own new and indirect object syntax leaves the parse up to Perl heuristics...
  my $Size = scalar @{$FingerprintsVectorA->{OrderedValuesRef}};

  $FingerprintsVectorA->{BitVector} = BitVector->new($Size);
  $FingerprintsVectorB->{BitVector} = BitVector->new($Size);

  # Set bits...
  # NOTE(review): second argument to SetBit presumably skips its range check - confirm against BitVector.
  my $SkipCheck = 1;
  for my $Index (0 .. ($Size - 1)) {
    if ($FingerprintsVectorA->{OrderedValuesRef}[$Index]) {
      $FingerprintsVectorA->{BitVector}->SetBit($Index, $SkipCheck);
    }
    if ($FingerprintsVectorB->{OrderedValuesRef}[$Index]) {
      $FingerprintsVectorB->{BitVector}->SetBit($Index, $SkipCheck);
    }
  }
}

# Return sum of ordered vector values...
#
# Result of StatisticsUtil::Sum on OrderedValuesRef of the specified vector is returned.
#
sub _GetSumOfFingerprintsOrderedValues {
  my($FingerprintVector) = @_;

  return StatisticsUtil::Sum($FingerprintVector->{OrderedValuesRef});
}

# Return sum of squared ordered vector values...
#
# Result of StatisticsUtil::SumOfSquares on OrderedValuesRef of the specified vector
# is returned.
#
sub _GetSumOfSquaresOfFingerprintsOrderedValues {
  my($Vector) = @_;

  return StatisticsUtil::SumOfSquares($Vector->{OrderedValuesRef});
}

# Return sum of product of corresponding ordered vector values...
#
# Returns SUM ( Xai * Xbi ); positions are taken from ordered values of the first
# vector and the sum is 0 when it has none.
#
sub _GetSumOfProductOfFingerprintsOrderedValues {
  my($VectorA, $VectorB) = @_;
  my $Total = 0;

  $Total += $VectorA->{OrderedValuesRef}[$_] * $VectorB->{OrderedValuesRef}[$_]
    for 0 .. $#{$VectorA->{OrderedValuesRef}};

  return $Total;
}

# Return sum of absolute value of subtraction of corresponding ordered vector values...
#
# Returns SUM ( ABS ( Xai - Xbi ) ); positions are taken from ordered values of the
# first vector and the sum is 0 when it has none.
#
sub _GetSumOfAbsoluteValueOfSubtractionOfFingerprintsOrderedValues {
  my($VectorA, $VectorB) = @_;
  my $Total = 0;

  $Total += abs($VectorA->{OrderedValuesRef}[$_] - $VectorB->{OrderedValuesRef}[$_])
    for 0 .. $#{$VectorA->{OrderedValuesRef}};

  return $Total;
}

# Return sum of squares of subtraction of corresponding ordered vector values...
#
# Returns SUM ( ( Xai - Xbi ) ** 2 ); positions are taken from ordered values of the
# first vector and the sum is 0 when it has none.
#
sub _GetSumOfSquaresOfSubtractionOfFingerprintsOrderedValues {
  my($VectorA, $VectorB) = @_;
  my $Total = 0;

  $Total += ($VectorA->{OrderedValuesRef}[$_] - $VectorB->{OrderedValuesRef}[$_])**2
    for 0 .. $#{$VectorA->{OrderedValuesRef}};

  return $Total;
}

# Return sum of minimum of corresponding ordered vector values...
#
# Parameters:
#   $FingerprintsVectorA, $FingerprintsVectorB - fingerprints vector hashes with
#     OrderedValuesRef entries; positions are taken from the array of vector A.
#
# Returns the sum over all positions of MathUtil::min(A[i], B[i]); 0 for empty arrays.
#
sub _GetSumOfMinimumOfFingerprintsOrderdedValues {
  my($FingerprintsVectorA, $FingerprintsVectorB) = @_;

  my $ValuesARef = $FingerprintsVectorA->{OrderedValuesRef};
  my $ValuesBRef = $FingerprintsVectorB->{OrderedValuesRef};

  my $SumOfMinimums = 0;
  foreach my $Position (0 .. $#{$ValuesARef}) {
    $SumOfMinimums += MathUtil::min($ValuesARef->[$Position], $ValuesBRef->[$Position]);
  }

  return $SumOfMinimums;
}

# Return sum of maximum of corresponding ordered vector values...
#
# Parameters:
#   $FingerprintsVectorA, $FingerprintsVectorB - fingerprints vector hashes with
#     OrderedValuesRef entries; positions are taken from the array of vector A.
#
# Returns the sum over all positions of MathUtil::max(A[i], B[i]); 0 for empty arrays.
#
sub _GetSumOfMaximumOfFingerprintsOrderdedValues {
  my($FingerprintsVectorA, $FingerprintsVectorB) = @_;

  my $ValuesARef = $FingerprintsVectorA->{OrderedValuesRef};
  my $ValuesBRef = $FingerprintsVectorB->{OrderedValuesRef};

  my $SumOfMaximums = 0;
  foreach my $Position (0 .. $#{$ValuesARef}) {
    $SumOfMaximums += MathUtil::max($ValuesARef->[$Position], $ValuesBRef->[$Position]);
  }

  return $SumOfMaximums;
}

# Get number of Na, Nb and Nc bits in vector A and B for BinaryForm calculation...
#
# Parameters:
#   $FingerprintsVectorA, $FingerprintsVectorB - fingerprints vector hashes whose
#     BitVector entries hold the bit vector objects to compare.
#
# Returns the list (Na, Nb, Nc): number of bits set in A, number of bits set in B
# and number of bits set in the result of "A & B".
#
sub _GetNumOfIndividualAndCommonSetBits ($$) {
  my($FingerprintsVectorA, $FingerprintsVectorB) = @_;

  my $BitsOfA = $FingerprintsVectorA->{BitVector};
  my $BitsOfB = $FingerprintsVectorB->{BitVector};

  # Bits set to "1" in A and in B individually...
  my $SetInA = $BitsOfA->GetNumOfSetBits();
  my $SetInB = $BitsOfB->GetNumOfSetBits();

  # Bits set to "1" in both A and B...
  my $SetInBoth = ($BitsOfA & $BitsOfB)->GetNumOfSetBits();

  return ($SetInA, $SetInB, $SetInBoth);
}

# Return a list of supported distance coefficients...
#
# Returns the names held in the file-level @DistanceCoefficients list.
#
sub GetSupportedDistanceCoefficients () {

  return @DistanceCoefficients;
}

# Return a list of supported similarity coefficients...
#
# Returns the names held in the file-level @SimilarityCoefficients list.
#
sub GetSupportedSimilarityCoefficients () {

  return @SimilarityCoefficients;
}

# Return a list of supported distance and similarity coefficients...
#
# Returns the distance and similarity coefficient names combined into a single
# list sorted in default (string) order.
#
sub GetSupportedDistanceAndSimilarityCoefficients () {
  my(@DistanceAndSimilarityCoefficients);

  # Sort into a named array before returning: the result of "return sort ..." is
  # undefined when the caller evaluates this function in scalar context.
  @DistanceAndSimilarityCoefficients = sort(@DistanceCoefficients, @SimilarityCoefficients);

  return @DistanceAndSimilarityCoefficients;
}

# Is it a fingerprints vector object?
#
# Exported wrapper around _IsFingerprintsVector; returns 1 or 0.
#
sub IsFingerprintsVector ($) {
  my($Object) = @_;

  return _IsFingerprintsVector($Object);
}

# Is it a fingerprints vector object?
#
# Returns 1 when $Object is a blessed reference whose class is, or inherits from,
# the class named in $ClassName; 0 otherwise.
#
sub _IsFingerprintsVector {
  my($Object) = @_;

  return (Scalar::Util::blessed($Object) && $Object->isa($ClassName)) ? 1 : 0;
}

# Return a string containing vector values...
#
# Handler for the overloaded '""' operator. The string lists the vector type, the
# number of values, the number of non-zero values for OrderedNumericalValues and
# NumericalValues types, and the values and value IDs themselves.
#
# At most 500 values and 500 value IDs are written out. Each list is truncated,
# and labeled as truncated, only when it actually contains more items than that
# limit; the decision is made for values and value IDs independently.
#
sub StringifyFingerprintsVector {
  my($This) = @_;
  my($FingerprintsVectorString, $MaxValuesToStringify);
  my($NumOfValues, $ValuesLabel, $ValuesString, $NumOfValueIDs, $ValueIDsLabel, $ValueIDsString);

  $MaxValuesToStringify = 500;

  # Set type, values and value IDs...
  $NumOfValues = $This->GetNumOfValues();
  ($ValuesLabel, $ValuesString) = _StringifyFingerprintsVectorItems('Values', $This->{Values}, $NumOfValues, $MaxValuesToStringify);

  $NumOfValueIDs = $This->GetNumOfValueIDs();
  ($ValueIDsLabel, $ValueIDsString) = _StringifyFingerprintsVectorItems('ValueIDs', $This->{ValueIDs}, $NumOfValueIDs, $MaxValuesToStringify);

  $FingerprintsVectorString = "Type: $This->{Type}; NumOfValues: $NumOfValues";
  if ($This->{Type} =~ /^(OrderedNumericalValues|NumericalValues)$/i) {
    my($NumOfNonZeroValues);
    $NumOfNonZeroValues = $This->GetNumOfNonZeroValues();
    $FingerprintsVectorString .= "; NumOfNonZeroValues: $NumOfNonZeroValues";
  }

  # Append all the values and value IDs...
  $FingerprintsVectorString .= "; $ValuesLabel: <$ValuesString>; NumOfValueIDs: $NumOfValueIDs; $ValueIDsLabel: <$ValueIDsString>";

  return $FingerprintsVectorString;
}

# Format one list of items for StringifyFingerprintsVector...
#
# Parameters:
#   $Label      - name of the list ('Values' or 'ValueIDs').
#   $ItemsRef   - reference to the array of items.
#   $NumOfItems - number of items reported for the list.
#   $MaxItems   - maximum number of items to write out.
#
# Returns a (label, string) pair. Lists longer than $MaxItems yield the first
# $MaxItems items followed by " ..." along with a "(Truncated after N)" label;
# shorter lists are written out completely and empty lists as 'None'.
#
sub _StringifyFingerprintsVectorItems {
  my($Label, $ItemsRef, $NumOfItems, $MaxItems) = @_;

  if ($NumOfItems > $MaxItems) {
    my(@LeadingItems);
    @LeadingItems = @{$ItemsRef}[0 .. ($MaxItems - 1)];
    return ("$Label (Truncated after $MaxItems)", join(' ', @LeadingItems) . " ...");
  }

  return ($Label, $NumOfItems ? join(' ', @{$ItemsRef}) : 'None');
}