1 package Fingerprints::PathLengthFingerprints; 2 # 3 # File: PathLengthFingerprints.pm 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2025 Manish Sud. All rights reserved. 7 # 8 # This file is part of MayaChemTools. 9 # 10 # MayaChemTools is free software; you can redistribute it and/or modify it under 11 # the terms of the GNU Lesser General Public License as published by the Free 12 # Software Foundation; either version 3 of the License, or (at your option) any 13 # later version. 14 # 15 # MayaChemTools is distributed in the hope that it will be useful, but without 16 # any warranty; without even the implied warranty of merchantability of fitness 17 # for a particular purpose. See the GNU Lesser General Public License for more 18 # details. 19 # 20 # You should have received a copy of the GNU Lesser General Public License 21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 23 # Boston, MA, 02111-1307, USA. 24 # 25 26 use strict; 27 use Carp; 28 use Exporter; 29 use TextUtil (); 30 use MathUtil (); 31 use Fingerprints::Fingerprints; 32 use Molecule; 33 use AtomTypes::AtomicInvariantsAtomTypes; 34 use AtomTypes::DREIDINGAtomTypes; 35 use AtomTypes::EStateAtomTypes; 36 use AtomTypes::FunctionalClassAtomTypes; 37 use AtomTypes::MMFF94AtomTypes; 38 use AtomTypes::SLogPAtomTypes; 39 use AtomTypes::SYBYLAtomTypes; 40 use AtomTypes::TPSAAtomTypes; 41 use AtomTypes::UFFAtomTypes; 42 43 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); 44 45 @ISA = qw(Fingerprints::Fingerprints Exporter); 46 @EXPORT = qw(); 47 @EXPORT_OK = qw(); 48 49 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); 50 51 # Setup class variables... 52 my($ClassName); 53 _InitializeClass(); 54 55 # Overload Perl functions... 56 use overload '""' => 'StringifyPathLengthFingerprints'; 57 58 # Class constructor... 59 sub new { 60 my($Class, %NamesAndValues) = @_; 61 62 # Initialize object... 63 my $This = $Class->SUPER::new(); 64 bless $This, ref($Class) || $Class; 65 $This->_InitializePathLengthFingerprints(); 66 67 $This->_InitializePathLengthFingerprintsProperties(%NamesAndValues); 68 69 return $This; 70 } 71 72 # Initialize object data... 73 # 74 sub _InitializePathLengthFingerprints { 75 my($This) = @_; 76 77 # Type of fingerprint to generate... 78 # 79 # PathLengthBits - A bit vector indicating presence/absence of atom paths 80 # PathLengthCount - A vector containing count of atom paths 81 # 82 $This->{Type} = ''; 83 84 # Type of vector: FingerprintsBitVector or FingerprintsVector 85 $This->{VectorType} = ''; 86 87 # Set default mininum, maximum, and default size. Although any arbitrary size can 88 # be specified, bit vector used to store bits work on a vector size which is 89 # power of 2 and additonal bits are automatically added and cleared. 90 # 91 $This->{Size} = 1024; 92 93 $This->{MinSize} = 32; 94 $This->{MaxSize} = 2**32; 95 96 # Minimum and maximum path lengths to use for fingerprints generation... 97 $This->{MinLength} = 1; 98 $This->{MaxLength} = 8; 99 100 # Numner of bits to set for each atom path for FingerprintsBitVector... 101 $This->{NumOfBitsToSetPerPath} = 1; 102 103 # Atom identifier type to use for path atoms during fingerprints generation... 104 # 105 # Currently supported values are: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, 106 # EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, 107 # SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes 108 # 109 $This->{AtomIdentifierType} = ''; 110 111 # Atom types assigned to atoms... 112 %{$This->{AssignedAtomTypes}} = (); 113 114 # For molecules containing rings, atom paths starting from each atom can be traversed in four 115 # different ways: 116 # 117 # . Atom paths without any rings and sharing of bonds in traversed paths. 118 # . Atom paths containing rings and without any sharing of bonds in traversed paths 119 # . All possible atom paths without any rings and sharing of bonds in traversed paths 120 # . All possible atom paths containing rings and with sharing of bonds in traversed paths. 121 # 122 # Atom path traversal is terminated at the last ring atom. For molecules containing no rings, 123 # first two and last two types described above are equivalent. 124 # 125 # AllowSharedBonds and AllowRings variables allow generation of differen types of paths 126 # to be used for fingerprints generation. 127 # 128 # In addition to atom symbols, bond symbols are also used to generate a string 129 # for atom paths. These atom paths strings are hased to a 32 bit integer key which 130 # in turn is used as a seed for a random number generation in range of 1 to fingerprint 131 # size for setting corresponding bit in bit vector. 132 # 133 # UseBondSymbols variable allow generation of atom path strings and consequently fingerprints. 134 # 135 # Combination of AllowSharedBonds, AllowRings, and UseBondSymbols allow generation of 136 # 8 different types of path length fingerprints: 137 # 138 # AllowSharedBonds AllowRings UseBondSymbols PathLengthFingerprintsType 139 # 140 # No No Yes AtomPathsNoCyclesWithBondSymbols 141 # No Yes Yes AtomPathsWithCyclesWithBondSymbols 142 # 143 # Yes No Yes AllAtomPathsNoCyclesWithBondSymbols 144 # Yes Yes Yes AllAtomPathsWithCyclesWithBondSymbols [ DEFAULT ] 145 # 146 # No No No AtomPathsNoCyclesNoBondSymbols 147 # No Yes No AtomPathsWithCyclesNoBondSymbols 148 # 149 # Yes No No AllAtomPathsNoCyclesNoBondSymbols 150 # Yes Yes No AllAtomPathsWithCyclesNoWithBondSymbols 151 # 152 # 153 154 # By default, atom paths starting from atoms are allowed to share bonds already traversed... 155 $This->{AllowSharedBonds} = 1; 156 157 # By default rings are included in paths... 158 $This->{AllowRings} = 1; 159 160 # By default bond symbols are included in atom path strings... 161 $This->{UseBondSymbols} = 1; 162 163 # By default only structurally unique atom paths are used for generation 164 # atom path strings... 165 $This->{UseUniquePaths} = 1; 166 167 # Random number generator to use during generation of fingerprints bit-vector 168 # string: Perl CORE::rand or MayaChemTools MathUtil::random function. 169 # 170 # The random number generator implemented in MayaChemTools is a variant of 171 # linear congruential generator (LCG) as described by Miller et al. [ Ref 120 ]. 172 # It is also referred to as Lehmer random number generator or Park-Miller 173 # random number generator. 174 # 175 # Unlike Perl's core random number generator function rand, the random number 176 # generator implemented in MayaChemTools, MathUtil::random, generates consistent 177 # random values across different platformsfor a specific random seed and leads 178 # to generation of portable fingerprints bit-vector strings. 179 # 180 $This->{UsePerlCoreRandom} = 1; 181 182 # Bond symbols to use during generation of atom path strings... 183 %{$This->{BondOrderToSymbol}} = (); 184 %{$This->{BondOrderToSymbol}} = ('1' => '', '1.5' => ':', '2' => '=', '3' => '#'); 185 186 # BondSymbols map to use for bonded atom IDs to use during atom path strings... 187 %{$This->{BondSymbols}} = (); 188 189 # Path atom IDs to remove duplicate paths... 190 %{$This->{UniqueLinearAtomPathsIDs}} = (); 191 %{$This->{UniqueCyclicAtomPathsIDs}} = (); 192 193 # Reference to all the atom paths upto specified path length... 194 $This->{AtomPathsRef} = ''; 195 196 # Atom paths strings created using specified atom types and bond symbols... 197 %{$This->{AtomPathsStrings}} = (); 198 } 199 200 # Initialize class ... 201 sub _InitializeClass { 202 #Class name... 203 $ClassName = __PACKAGE__; 204 } 205 206 # Initialize object properties.... 207 sub _InitializePathLengthFingerprintsProperties { 208 my($This, %NamesAndValues) = @_; 209 210 my($Name, $Value, $MethodName); 211 while (($Name, $Value) = each %NamesAndValues) { 212 $MethodName = "Set${Name}"; 213 $This->$MethodName($Value); 214 } 215 216 # Make sure molecule object was specified... 217 if (!exists $NamesAndValues{Molecule}) { 218 croak "Error: ${ClassName}->New: Object can't be instantiated without specifying molecule..."; 219 } 220 221 if (!exists $NamesAndValues{Type}) { 222 croak "Error: ${ClassName}->New: Object can't be instantiated without specifying Type..."; 223 } 224 225 if (!exists $NamesAndValues{AtomIdentifierType}) { 226 croak "Error: ${ClassName}->New: Object can't be instantiated without specifying AtomIdentifierType..."; 227 } 228 229 # Make sure it's power of 2... 230 if (exists $NamesAndValues{Size}) { 231 if (!TextUtil::IsNumberPowerOfNumber($NamesAndValues{Size}, 2)) { 232 croak "Error: ${ClassName}->New: Specified size value, $NamesAndValues{Size}, must be power of 2..."; 233 } 234 } 235 236 if ($This->{Type} =~ /^PathLengthBits$/i) { 237 $This->_InitializePathLengthBits(); 238 } 239 elsif ($This->{Type} =~ /^PathLengthCount$/i) { 240 $This->_InitializePathLengthCount(); 241 } 242 else { 243 croak "Error: ${ClassName}->_InitializePathLengthFingerprintsProperties: Unknown PathLength type: $This->{Type}; Supported PathLength type : PathLengthBits or PathLengthCount......"; 244 } 245 246 return $This; 247 } 248 249 # Initialize PathLength bits... 250 # 251 sub _InitializePathLengthBits { 252 my($This) = @_; 253 254 # Vector type... 255 $This->{VectorType} = 'FingerprintsBitVector'; 256 257 $This->_InitializeFingerprintsBitVector(); 258 259 return $This; 260 } 261 262 # Initialize PathLength key count... 263 # 264 sub _InitializePathLengthCount { 265 my($This) = @_; 266 267 # Vector type and type of values... 268 $This->{VectorType} = 'FingerprintsVector'; 269 $This->{FingerprintsVectorType} = 'NumericalValues'; 270 271 $This->_InitializeFingerprintsVector(); 272 273 return $This; 274 } 275 276 # Set type... 277 # 278 sub SetType { 279 my($This, $Type) = @_; 280 281 if ($This->{Type}) { 282 croak "Error: ${ClassName}->SetType: Can't change type: It's already set..."; 283 } 284 285 if ($Type =~ /^PathLengthBits$/i) { 286 $This->{Type} = 'PathLengthBits';; 287 } 288 elsif ($Type =~ /^PathLengthCount$/i) { 289 $This->{Type} = 'PathLengthCount';; 290 } 291 else { 292 croak "Error: ${ClassName}->SetType: Unknown PathLength keys: $Type; Supported PathLength types: PathLengthBits or PathLengthCount..."; 293 } 294 return $This; 295 } 296 297 # Disable vector type change... 298 # 299 sub SetVectorType { 300 my($This, $Type) = @_; 301 302 croak "Error: ${ClassName}->SetVectorType: Can't change vector type..."; 303 304 return $This; 305 } 306 307 # Disable vector type change... 308 # 309 sub SetFingerprintsVectorType { 310 my($This, $Type) = @_; 311 312 croak "Error: ${ClassName}->SetFingerprintsVectorType: Can't change fingerprints vector type..."; 313 314 return $This; 315 } 316 317 # Set atom identifier type to use for path length atom identifiers... 318 # 319 sub SetAtomIdentifierType { 320 my($This, $IdentifierType) = @_; 321 322 if ($IdentifierType !~ /^(AtomicInvariantsAtomTypes|DREIDINGAtomTypes|EStateAtomTypes|FunctionalClassAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) { 323 croak "Error: ${ClassName}->SetAtomIdentifierType: Specified value, $IdentifierType, for AtomIdentifierType is not vaild. Supported types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, and UFFAtomTypes."; 324 } 325 326 if ($This->{AtomIdentifierType}) { 327 croak "Error: ${ClassName}->SetAtomIdentifierType: Can't change atom identifier type: It's already set..."; 328 } 329 330 $This->{AtomIdentifierType} = $IdentifierType; 331 332 # Initialize atom identifier type information... 333 $This->_InitializeAtomIdentifierTypeInformation(); 334 335 return $This; 336 } 337 338 # Set minimum path length... 339 # 340 sub SetMinLength { 341 my($This, $Value) = @_; 342 343 if (!TextUtil::IsPositiveInteger($Value)) { 344 croak "Error: ${ClassName}->SetMinLength: MinLength value, $Value, is not valid: It must be a positive integer..."; 345 } 346 $This->{MinLength} = $Value; 347 348 return $This; 349 } 350 351 # Set maximum path length... 352 # 353 sub SetMaxLength { 354 my($This, $Value) = @_; 355 356 if (!TextUtil::IsPositiveInteger($Value)) { 357 croak "Error: ${ClassName}->SetMaxLength: MaxLength value, $Value, is not valid: It must be a positive integer..."; 358 } 359 $This->{MaxLength} = $Value; 360 361 return $This; 362 } 363 364 # Set number of bits to set for each path... 365 # 366 sub SetNumOfBitsToSetPerPath { 367 my($This, $Value) = @_; 368 369 if (!TextUtil::IsPositiveInteger($Value)) { 370 croak "Error: ${ClassName}->SetNumOfBitsToSetPerPath: NumOfBitsToSetPerPath value, $Value, is not valid: It must be a positive integer..."; 371 } 372 $This->{NumOfBitsToSetPerPath} = $Value; 373 374 return $This; 375 } 376 377 # Generate fingerprints description... 378 # 379 sub GetDescription { 380 my($This) = @_; 381 382 # Is description explicity set? 383 if (exists $This->{Description}) { 384 return $This->{Description}; 385 } 386 387 # Generate fingerprints description... 388 389 return "$This->{Type}:$This->{AtomIdentifierType}:MinLength$This->{MinLength}:MaxLength$This->{MaxLength}"; 390 } 391 392 # Generate path length fingerprints... 393 # 394 sub GenerateFingerprints { 395 my($This) = @_; 396 397 if ($This->{MinLength} > $This->{MaxLength}) { 398 croak "Error: ${ClassName}->GenerateFingerprints: No fingerpritns generated: MinLength, $This->{MinLength}, must be <= MaxLength, $This->{MaxLength}..."; 399 } 400 401 # Cache appropriate molecule data... 402 $This->_SetupMoleculeDataCache(); 403 404 # Assign atom types to all atoms... 405 if (!$This->_AssignAtomTypes()) { 406 carp "Warning: ${ClassName}->GenerateFingerprints: $This->{AtomIdentifierType} fingerprints generation didn't succeed: Couldn't assign valid $This->{AtomIdentifierType} to all atoms..."; 407 return $This; 408 } 409 410 # Setup bond symbol map... 411 if ($This->{UseBondSymbols}) { 412 $This->_InitializeBondSymbols(); 413 } 414 415 # Generate appropriate atom paths... 416 $This->_GenerateAtomPathsUpToMaxLength(); 417 418 # Initialize atom path strings... 419 $This->_InitializeAtomPathsStrings(); 420 421 # Generate appropriate atom path strings for unique atom paths... 422 $This->_GenerateAtomPathsStrings(); 423 424 # Set final fingerprints... 425 $This->_SetFinalFingerprints(); 426 427 # Clear cached molecule data... 428 $This->_ClearMoleculeDataCache(); 429 430 return $This; 431 } 432 433 # Assign appropriate atom types to all atoms... 434 # 435 sub _AssignAtomTypes { 436 my($This) = @_; 437 my($SpecifiedAtomTypes, $Atom, $AtomID, $IgnoreHydrogens); 438 439 %{$This->{AssignedAtomTypes}} = (); 440 $IgnoreHydrogens = 0; 441 442 $SpecifiedAtomTypes = undef; 443 444 IDENTIFIERTYPE: { 445 if ($This->{AtomIdentifierType} =~ /^AtomicInvariantsAtomTypes$/i) { 446 $SpecifiedAtomTypes = new AtomTypes::AtomicInvariantsAtomTypes('Molecule' => $This->{Molecule}, 'IgnoreHydrogens' => $IgnoreHydrogens, 'AtomicInvariantsToUse' => $This->{AtomicInvariantsToUse}); 447 last IDENTIFIERTYPE; 448 } 449 450 if ($This->{AtomIdentifierType} =~ /^DREIDINGAtomTypes$/i) { 451 $SpecifiedAtomTypes = new AtomTypes::DREIDINGAtomTypes('Molecule' => $This->{Molecule}, 'IgnoreHydrogens' => $IgnoreHydrogens); 452 last IDENTIFIERTYPE; 453 } 454 455 if ($This->{AtomIdentifierType} =~ /^EStateAtomTypes$/i) { 456 $SpecifiedAtomTypes = new AtomTypes::EStateAtomTypes('Molecule' => $This->{Molecule}, 'IgnoreHydrogens' => $IgnoreHydrogens); 457 last IDENTIFIERTYPE; 458 } 459 460 if ($This->{AtomIdentifierType} =~ /^FunctionalClassAtomTypes$/i) { 461 $SpecifiedAtomTypes = new AtomTypes::FunctionalClassAtomTypes('Molecule' => $This->{Molecule}, 'IgnoreHydrogens' => $IgnoreHydrogens, 'FunctionalClassesToUse' => $This->{FunctionalClassesToUse}); 462 last IDENTIFIERTYPE; 463 } 464 465 if ($This->{AtomIdentifierType} =~ /^MMFF94AtomTypes$/i) { 466 $SpecifiedAtomTypes = new AtomTypes::MMFF94AtomTypes('Molecule' => $This->{Molecule}, 'IgnoreHydrogens' => $IgnoreHydrogens); 467 last IDENTIFIERTYPE; 468 } 469 470 if ($This->{AtomIdentifierType} =~ /^SLogPAtomTypes$/i) { 471 $SpecifiedAtomTypes = new AtomTypes::SLogPAtomTypes('Molecule' => $This->{Molecule}, 'IgnoreHydrogens' => $IgnoreHydrogens); 472 last IDENTIFIERTYPE; 473 } 474 if ($This->{AtomIdentifierType} =~ /^SYBYLAtomTypes$/i) { 475 $SpecifiedAtomTypes = new AtomTypes::SYBYLAtomTypes('Molecule' => $This->{Molecule}, 'IgnoreHydrogens' => $IgnoreHydrogens); 476 last IDENTIFIERTYPE; 477 } 478 479 if ($This->{AtomIdentifierType} =~ /^TPSAAtomTypes$/i) { 480 $SpecifiedAtomTypes = new AtomTypes::TPSAAtomTypes('Molecule' => $This->{Molecule}, 'IgnorePhosphorus' => 0, 'IgnoreSulfur' => 0); 481 last IDENTIFIERTYPE; 482 } 483 484 if ($This->{AtomIdentifierType} =~ /^UFFAtomTypes$/i) { 485 $SpecifiedAtomTypes = new AtomTypes::UFFAtomTypes('Molecule' => $This->{Molecule}, 'IgnoreHydrogens' => $IgnoreHydrogens); 486 last IDENTIFIERTYPE; 487 } 488 489 croak "Error: ${ClassName}->_AssignAtomTypes: Unknown atom indentifier type $This->{AtomIdentifierType}..."; 490 } 491 492 # Assign atom types... 493 $SpecifiedAtomTypes->AssignAtomTypes(); 494 495 # Make sure atom types assignment is successful... 496 if (!$SpecifiedAtomTypes->IsAtomTypesAssignmentSuccessful()) { 497 return undef; 498 } 499 500 # Collect assigned atom types... 501 ATOM: for $Atom (@{$This->{Atoms}}) { 502 $AtomID = $Atom->GetID(); 503 $This->{AssignedAtomTypes}{$AtomID} = $SpecifiedAtomTypes->GetAtomType($Atom); 504 } 505 506 return $This; 507 } 508 509 # Setup bond symbol map for atoms to speed up generation of path length identifiers 510 # during fingerprints generation... 511 # 512 sub _InitializeBondSymbols { 513 my($This) = @_; 514 my($Atom1, $Atom2, $AtomID1, $AtomID2, $Bond, $BondSymbol, $BondOrder); 515 516 %{$This->{BondSymbols}} = (); 517 518 if (!$This->{UseBondSymbols}) { 519 return $This; 520 } 521 522 for $Bond ($This->{Molecule}->GetBonds()) { 523 $BondOrder = $Bond->GetBondOrder(); 524 $BondSymbol = $Bond->IsAromatic() ? ':' : (exists($This->{BondOrderToSymbol}{$BondOrder}) ? $This->{BondOrderToSymbol}{$BondOrder} : $BondOrder); 525 ($Atom1, $Atom2) = $Bond->GetAtoms(); 526 $AtomID1 = $Atom1->GetID(); $AtomID2 = $Atom2->GetID(); 527 if ($AtomID1 > $AtomID2) { 528 ($AtomID1, $AtomID2) = ($AtomID2, $AtomID1); 529 } 530 531 if (!exists $This->{BondSymbols}{$AtomID1}) { 532 %{$This->{BondSymbols}{$AtomID1}} = (); 533 } 534 $This->{BondSymbols}{$AtomID1}{$AtomID2} = $BondSymbol; 535 } 536 return $This; 537 } 538 539 # Get appropriate atom paths with length up to MaxLength... 540 # 541 sub _GenerateAtomPathsUpToMaxLength { 542 my($This) = @_; 543 my($PathLength, $AllowRings, $Molecule, $AtomPathsRef); 544 545 $PathLength = $This->{MaxLength}; 546 $AllowRings = $This->{AllowRings}; 547 $Molecule = $This->{Molecule}; 548 549 if ($This->{AllowSharedBonds}) { 550 $AtomPathsRef = $Molecule->GetAllAtomPathsWithLengthUpto($PathLength, $AllowRings); 551 } 552 else { 553 $AtomPathsRef = $Molecule->GetAtomPathsWithLengthUpto($PathLength, $AllowRings); 554 } 555 $This->{AtomPathsRef} = $AtomPathsRef; 556 557 return $This; 558 } 559 560 # Initialize atom paths strings at various pathlength levels... 561 # 562 sub _InitializeAtomPathsStrings { 563 my($This) = @_; 564 my($PathLength); 565 566 %{$This->{AtomPathsStrings}} = (); 567 568 for $PathLength ($This->{MinLength} .. $This->{MaxLength}) { 569 %{$This->{AtomPathsStrings}{$PathLength}} = (); 570 } 571 572 return $This; 573 } 574 575 # Generate appropriate atom path strings for unique atom paths... 576 # 577 sub _GenerateAtomPathsStrings { 578 my($This, $PathAtomsRef) = @_; 579 my($PathLength, $MinPathLength, $UseUniquePaths); 580 581 $MinPathLength = $This->{MinLength}; 582 $UseUniquePaths = $This->{UseUniquePaths}; 583 584 PATHATOMS: for $PathAtomsRef (@{$This->{AtomPathsRef}}) { 585 $PathLength = scalar @{$PathAtomsRef}; 586 if ($PathLength < $MinPathLength) { 587 next PATHATOMS; 588 } 589 if ($UseUniquePaths) { 590 $This->_GenerateAtomPathStringUsingUniquePath($PathAtomsRef); 591 } 592 else { 593 $This->_GenerateAtomPathString($PathAtomsRef); 594 } 595 } 596 return $This; 597 } 598 599 # Generate atom path string using unique path... 600 # 601 sub _GenerateAtomPathStringUsingUniquePath { 602 my($This, $PathAtomsRef) = @_; 603 604 if ($This->{AllowRings} && $This->_DoesAtomPathContainsCycle($PathAtomsRef)) { 605 $This->_GenerateAtomPathStringUsingUniquePathContainingCycle($PathAtomsRef); 606 } 607 else { 608 $This->_GenerateAtomPathStringUsingUniqueLinearPath($PathAtomsRef); 609 } 610 return $This; 611 } 612 613 # Generate atom path string for specified path containing no cycle... 614 # 615 sub _GenerateAtomPathStringUsingUniqueLinearPath { 616 my($This, $PathAtomsRef) = @_; 617 618 # Is it a unique linear atom path? 619 # 620 if (!$This->_IsUniqueLinearAtomPath($PathAtomsRef)) { 621 return $This; 622 } 623 $This->_GenerateAtomPathString($PathAtomsRef); 624 625 return $This; 626 } 627 628 # Is it a structurally unique linear path? 629 # 630 # For a path to be structurally unique, all of its atom IDs must be diffferent from any 631 # earlier path atom IDs. In order to generate atom path atom ID invariant of the atom 632 # order in the molecule, atom IDs are sorted numerically before generating the path ID. 633 # 634 # Notes: 635 # . Atom path ID doesn't reflect the order of atoms in the atom path. 636 # 637 sub _IsUniqueLinearAtomPath { 638 my($This, $PathAtomsRef) = @_; 639 my($AtomPathID, $PathLength, @PathAtomIDs); 640 641 @PathAtomIDs = (); 642 @PathAtomIDs = map { $_->GetID(); } @{$PathAtomsRef}; 643 644 $AtomPathID = join '-', sort { $a <=> $b } @PathAtomIDs; 645 if (exists $This->{UniqueLinearAtomPathsIDs}{$AtomPathID}) { 646 return 0; 647 } 648 649 # It's a unique atom path... 650 $This->{UniqueLinearAtomPathsIDs}{$AtomPathID} = 1; 651 652 return 1; 653 } 654 655 # Generate atom path string for specified path containing a cycle... 656 # 657 sub _GenerateAtomPathStringUsingUniquePathContainingCycle { 658 my($This, $PathAtomsRef) = @_; 659 660 # Is it a unique atom path containing a cycle? 661 # 662 if (!$This->_IsUniqueAtomPathContainingCycle($PathAtomsRef)) { 663 return $This; 664 } 665 666 my($CycleClosingPathAtomIndex); 667 ($CycleClosingPathAtomIndex) = $This->_GetAtomPathCycleClosingAtomIndex($PathAtomsRef); 668 669 if ($CycleClosingPathAtomIndex == 0) { 670 $This->_GenerateUniqueAtomPathStringForPathCycle($PathAtomsRef); 671 } 672 else { 673 $This->_GenerateUniqueAtomPathStringForPathContainingCycle($PathAtomsRef, $CycleClosingPathAtomIndex); 674 } 675 return $This; 676 } 677 678 # Generate a unique atom path string for a cyclic path by generating atom path 679 # strings for all possible paths in the cycle and keeping the lexicographically smallest 680 # one. 681 # 682 # Although all the paths enumerated during atom path string generation are also 683 # present in the intial paths list, but structural uniqueness check would detect 684 # 'em earlier and this method ends being invoked only once for the first cyclic path. 685 # 686 # For atom paths containg same atom types and bond symbols, atom path strings 687 # would be same for the paths. 688 # 689 sub _GenerateUniqueAtomPathStringForPathCycle { 690 my($This, $PathAtomsRef) = @_; 691 692 if ($This->_AreAllPathAtomsSymbolsSame($PathAtomsRef) && $This->_AreAllPathBondSymbolsSame($PathAtomsRef)) { 693 return $This->_GenerateAtomPathString($PathAtomsRef); 694 } 695 696 # Generate all possible atom path strings and select the lexicographically smallest one... 697 my($Index, $PathLength, $FinalAtomPathString, $FirstAtomPathString, $LastIndex, $FirstPartIndex, $FirstPartStartIndex, $FirstPartEndIndex, $SecondPartIndex, $SecondPartStartIndex, $SecondPartEndIndex, $AtomPathSymbolsRef, $AtomPathString, $ReverseAtomPathString, @FirstPartPathAtoms, @SecondPartPathAtoms, @PathAtoms); 698 699 $PathLength = scalar @{$PathAtomsRef}; 700 $LastIndex = $PathLength - 1; 701 702 $FinalAtomPathString = ''; 703 $FirstAtomPathString = 1; 704 705 @FirstPartPathAtoms = (); @SecondPartPathAtoms = (); @PathAtoms = (); 706 707 for $Index (0 .. ($LastIndex - 1)) { 708 @FirstPartPathAtoms = (); @SecondPartPathAtoms = (); @PathAtoms = (); 709 710 $FirstPartStartIndex = 0; $FirstPartEndIndex = $Index - 1; 711 $SecondPartStartIndex = $Index; $SecondPartEndIndex = $LastIndex - 1; 712 713 # Get first part atoms... 714 for $FirstPartIndex ($FirstPartStartIndex .. $FirstPartEndIndex) { 715 push @FirstPartPathAtoms, $PathAtomsRef->[$FirstPartIndex]; 716 } 717 718 # Get second part atoms... 719 for $SecondPartIndex ($SecondPartStartIndex .. $SecondPartEndIndex) { 720 push @SecondPartPathAtoms, $PathAtomsRef->[$SecondPartIndex]; 721 } 722 723 # Get final list of path atoms... 724 if (@SecondPartPathAtoms) { 725 push @PathAtoms, @SecondPartPathAtoms; 726 } 727 if (@FirstPartPathAtoms) { 728 push @PathAtoms, @FirstPartPathAtoms; 729 } 730 731 # Complete the cycle by adding first atom as the last atom... 732 push @PathAtoms, $PathAtomsRef->[$SecondPartStartIndex]; 733 734 # Generate atom path string... 735 $AtomPathSymbolsRef = $This->_GenerateAtomPathSymbols(\@PathAtoms); 736 737 $AtomPathString = join '', @{$AtomPathSymbolsRef}; 738 $ReverseAtomPathString = join '', reverse @{$AtomPathSymbolsRef}; 739 740 if ($ReverseAtomPathString le $AtomPathString) { 741 $AtomPathString = $ReverseAtomPathString; 742 } 743 744 # Update final atom path string... 745 746 if ($FirstAtomPathString) { 747 $FirstAtomPathString = 0; 748 $FinalAtomPathString = $AtomPathString; 749 } 750 else { 751 if ($AtomPathString le $FinalAtomPathString) { 752 $FinalAtomPathString = $AtomPathString; 753 } 754 } 755 } 756 757 # Set final atom path string... 758 # 759 if (exists $This->{AtomPathsStrings}{$PathLength}{$FinalAtomPathString}) { 760 $This->{AtomPathsStrings}{$PathLength}{$FinalAtomPathString} += 1; 761 } 762 else { 763 $This->{AtomPathsStrings}{$PathLength}{$FinalAtomPathString} = 1; 764 } 765 766 return $This; 767 } 768 769 # 770 # Generate a unique atom path string for paths containing a cycle closed by 771 # the specified atom index and the last atom index. 772 # 773 # The following methodology is used to generate atom path string which is 774 # independemt of initial atom ordering: 775 # . Generate atom paths string from first atom to the atom before the first cycle 776 # closing atom. 777 # . Generate atom path string from atoms from first cycle closing atom index to 778 # the last path atom in both forward and reverse order. And select the lexicographically 779 # smallest atom path string. 780 # . Combine atom path string generated in first step with second step to generate 781 # final atom path string. 782 # 783 sub _GenerateUniqueAtomPathStringForPathContainingCycle { 784 my($This, $PathAtomsRef, $CycleClosingAtomIndex) = @_; 785 my($Index, $PathLength, $LastIndex, $LinearPartStartIndex, $LinearPartEndIndex, $CyclicPartStartIndex, $CyclicPartEndIndex, $CyclicPartAtomPathSymbolsRef, $CyclicPartAtomPathString, $ReverseCyclicPartAtomPathString, $AtomPathString, $AtomPathSymbolsRef, @CyclicPartPathAtoms, @PathAtoms); 786 787 $PathLength = scalar @{$PathAtomsRef}; 788 $LastIndex = $PathLength - 1; 789 790 @PathAtoms = (); 791 792 # Get path atoms corresponding to linear part of the path... 793 $LinearPartStartIndex = 0; $LinearPartEndIndex = $CycleClosingAtomIndex - 1; 794 795 for $Index ($LinearPartStartIndex .. $LinearPartEndIndex) { 796 push @PathAtoms, $PathAtomsRef->[$Index]; 797 } 798 799 # Get atoms correcponding to cyclic part of the path... 800 @CyclicPartPathAtoms = (); 801 $CyclicPartStartIndex = $CycleClosingAtomIndex; $CyclicPartEndIndex = $LastIndex; 802 803 for $Index ($CyclicPartStartIndex .. $CyclicPartEndIndex) { 804 push @CyclicPartPathAtoms, $PathAtomsRef->[$Index]; 805 } 806 807 # Setup a lexicographically smaller atom path string for cyclic part... 808 809 $CyclicPartAtomPathSymbolsRef = $This->_GenerateAtomPathSymbols(\@CyclicPartPathAtoms); 810 $CyclicPartAtomPathString = join '', @{$CyclicPartAtomPathSymbolsRef}; 811 $ReverseCyclicPartAtomPathString = join '', reverse @{$CyclicPartAtomPathSymbolsRef}; 812 813 # Setup atom path corresponding to linear part and lexigraphicall smaller cyclic part... 814 815 if ($ReverseCyclicPartAtomPathString le $CyclicPartAtomPathString) { 816 push @PathAtoms, reverse @CyclicPartPathAtoms; 817 } 818 else { 819 push @PathAtoms, @CyclicPartPathAtoms; 820 } 821 822 # Setup final atom path string... 823 824 $AtomPathSymbolsRef = $This->_GenerateAtomPathSymbols(\@PathAtoms); 825 $AtomPathString = join '', @{$AtomPathSymbolsRef}; 826 827 if (exists $This->{AtomPathsStrings}{$PathLength}{$AtomPathString}) { 828 $This->{AtomPathsStrings}{$PathLength}{$AtomPathString} += 1; 829 } 830 else { 831 $This->{AtomPathsStrings}{$PathLength}{$AtomPathString} = 1; 832 } 833 834 return $This; 835 } 836 837 # Does atom path contain a cycle? 838 # 839 # For an atom path to contain cycle, it must satisfy the following conditions: 840 # . Pathlength >= 3 841 # . Last atom ID is equal to first atom ID or some other atom ID besides itself 842 # 843 sub _DoesAtomPathContainsCycle { 844 my($This, $PathAtomsRef) = @_; 845 my($PathLength); 846 847 $PathLength = scalar @{$PathAtomsRef}; 848 if ($PathLength <= 2) { 849 return 0; 850 } 851 852 my($AtomIndex, $LastAtomIndex, $Atom, $AtomID, $LastAtom, $LastAtomID); 853 854 $LastAtomIndex = $PathLength - 1; 855 $LastAtom = $PathAtomsRef->[$LastAtomIndex]; 856 $LastAtomID = $LastAtom->GetID(); 857 858 # Look for atomID similar to last atom ID... 859 for $AtomIndex (0 .. ($LastAtomIndex - 1)) { 860 $Atom = $PathAtomsRef->[$AtomIndex]; 861 $AtomID = $Atom->GetID(); 862 863 if ($AtomID == $LastAtomID) { 864 # It's a cycle... 865 return 1; 866 } 867 } 868 return 0; 869 } 870 871 # Get atom path cycle closing atom index... 872 # 873 sub _GetAtomPathCycleClosingAtomIndex { 874 my($This, $PathAtomsRef) = @_; 875 my($AtomIndex, $LastAtomIndex, $Atom, $AtomID, $LastAtom, $LastAtomID, $PathLength); 876 877 $PathLength = scalar @{$PathAtomsRef}; 878 879 $LastAtomIndex = $PathLength - 1; 880 $LastAtom = $PathAtomsRef->[$LastAtomIndex]; $LastAtomID = $LastAtom->GetID(); 881 882 # Look for atomID similar to last atom ID... 883 for $AtomIndex (0 .. ($LastAtomIndex - 1)) { 884 $Atom = $PathAtomsRef->[$AtomIndex]; $AtomID = $Atom->GetID(); 885 886 if ($AtomID == $LastAtomID) { 887 # It's a cycle closing atom... 888 return $AtomIndex; 889 } 890 } 891 return undef; 892 } 893 894 # Is it a structurally unique path containing a cycle? 895 # 896 # For atom paths containing cycles, last atom ID is either equal to first atom ID or 897 # some other atom ID besides itself. 898 # 899 # In order to determine its structurally unqiue independent of initial atom ordering, 900 # the following methodolgy is used: 901 # 902 # . For paths with same first and atom IDs: 903 # . Remove the last atom ID from atom path 904 # . Sort atom IDs in the path 905 # . Add first atom ID from the sorted list to the end of list to complete the cycle 906 # . Generate a atom path ID 907 # . Use final path ID to track uniqueness of path containing cycle. 908 # 909 # . For paths with last atom ID equal to some other atom ID besidies itself: 910 # . Sort atom IDs in atom path 911 # . Generate atom path ID and use it to track unqiueness of atom paths. 912 # 913 sub _IsUniqueAtomPathContainingCycle { 914 my($This, $PathAtomsRef) = @_; 915 my($PathLength, $AtomPathID, $FirstAtom, $LastAtom, $FirstAtomID, $LastAtomID, @PathAtomIDs, @SortedPathAtomIDs); 916 917 @PathAtomIDs = (); 918 @PathAtomIDs = map { $_->GetID(); } @{$PathAtomsRef}; 919 920 $PathLength = scalar @{$PathAtomsRef}; 921 922 $FirstAtom = $PathAtomsRef->[0]; $FirstAtomID = $FirstAtom->GetID(); 923 $LastAtom = $PathAtomsRef->[$PathLength - 1]; $LastAtomID = $LastAtom->GetID(); 924 925 if ($FirstAtomID == $LastAtomID) { 926 pop @PathAtomIDs; 927 928 @SortedPathAtomIDs = (); 929 @SortedPathAtomIDs = sort { $a <=> $b } @PathAtomIDs; 930 931 push @SortedPathAtomIDs, $SortedPathAtomIDs[0]; 932 933 $AtomPathID = join '-', @SortedPathAtomIDs; 934 } 935 else { 936 $AtomPathID = join '-', sort { $a <=> $b } @PathAtomIDs; 937 } 938 939 if (exists $This->{UniqueCyclicAtomPathsIDs}{$AtomPathID}) { 940 return 0; 941 } 942 943 # It's a unique atom path containing a cycle... 944 $This->{UniqueCyclicAtomPathsIDs}{$AtomPathID} = 1; 945 946 return 1; 947 } 948 949 # Generate atom path string for specified atom path... 950 # 951 sub _GenerateAtomPathString { 952 my($This, $PathAtomsRef) = @_; 953 my($PathLength, $AtomPathString, $ReverseAtomPathString, $AtomPathSymbolsRef); 954 955 $PathLength = scalar @{$PathAtomsRef}; 956 957 # Generate path atom and bond symbols... 958 # 959 $AtomPathSymbolsRef = $This->_GenerateAtomPathSymbols($PathAtomsRef); 960 961 # Check presence of path using path ID created by atom path symbols... 962 $AtomPathString = join '', @{$AtomPathSymbolsRef}; 963 if (exists $This->{AtomPathsStrings}{$PathLength}{$AtomPathString}) { 964 $This->{AtomPathsStrings}{$PathLength}{$AtomPathString} += 1; 965 return $This; 966 } 967 968 # Check presence of reverse path using path ID created by atom path symbols... 969 # 970 $ReverseAtomPathString = join '', reverse @{$AtomPathSymbolsRef}; 971 if (exists $This->{AtomPathsStrings}{$PathLength}{$ReverseAtomPathString}) { 972 $This->{AtomPathsStrings}{$PathLength}{$ReverseAtomPathString} += 1; 973 return $This; 974 } 975 976 # Use lexicographically smaller atom path string as PathID... 977 # 978 if ($AtomPathString le $ReverseAtomPathString) { 979 $This->{AtomPathsStrings}{$PathLength}{$AtomPathString} = 1; 980 } 981 else { 982 $This->{AtomPathsStrings}{$PathLength}{$ReverseAtomPathString} = 1; 983 } 984 return $This; 985 } 986 987 # Are atom types for all path atoms same? 988 # 989 sub _AreAllPathAtomsSymbolsSame { 990 my($This, $PathAtomsRef) = @_; 991 my($Index, $Atom, $AtomID, $AtomType, $FirstAtomType); 992 993 $Atom = $PathAtomsRef->[0]; $AtomID = $Atom->GetID(); 994 $FirstAtomType = $This->{AssignedAtomTypes}{$AtomID}; 995 996 for $Index (1 .. $#{$PathAtomsRef}) { 997 $Atom = $PathAtomsRef->[$Index]; $AtomID = $Atom->GetID(); 998 $AtomType = $This->{AssignedAtomTypes}{$AtomID}; 999 1000 if ($AtomType ne $FirstAtomType) { 1001 return 0; 1002 } 1003 } 1004 return 1; 1005 } 1006 1007 # Are bond symbols for all path bonds same? 1008 # 1009 sub _AreAllPathBondSymbolsSame { 1010 my($This, $PathAtomsRef) = @_; 1011 my($Index, $Atom, $BondedAtom, $AtomID, $BondedAtomID, $BondAtomID1, $BondAtomID2, $FirstBondSymbol, $BondSymbol); 1012 1013 # During no usage of bond symbols, just ignore them and assume they are same... 1014 if (!$This->{UseBondSymbols}) { 1015 return 1; 1016 } 1017 1018 $Atom = $PathAtomsRef->[0]; $BondedAtom = $PathAtomsRef->[1]; 1019 $AtomID = $Atom->GetID(); $BondedAtomID = $BondedAtom->GetID(); 1020 1021 ($BondAtomID1, $BondAtomID2) = ($AtomID < $BondedAtomID) ? ($AtomID, $BondedAtomID) : ($BondedAtomID, $AtomID); 1022 $FirstBondSymbol = $This->{BondSymbols}{$BondAtomID1}{$BondAtomID2}; 1023 1024 for $Index (1 .. ($#{$PathAtomsRef} - 1)) { 1025 $Atom = $PathAtomsRef->[$Index]; $BondedAtom = $PathAtomsRef->[$Index + 1]; 1026 $AtomID = $Atom->GetID(); $BondedAtomID = $BondedAtom->GetID(); 1027 1028 ($BondAtomID1, $BondAtomID2) = ($AtomID < $BondedAtomID) ? ($AtomID, $BondedAtomID) : ($BondedAtomID, $AtomID); 1029 $BondSymbol = $This->{BondSymbols}{$BondAtomID1}{$BondAtomID2}; 1030 1031 if ($BondSymbol ne $FirstBondSymbol) { 1032 return 0; 1033 } 1034 } 1035 return 1; 1036 } 1037 1038 # Generate atom path symbols... 1039 # 1040 sub _GenerateAtomPathSymbols { 1041 my($This, $PathAtomsRef) = @_; 1042 my($Atom, $AtomID, @AtomPathSymbols); 1043 1044 @AtomPathSymbols = (); 1045 1046 if (@{$PathAtomsRef} == 1) { 1047 $Atom = $PathAtomsRef->[0]; $AtomID = $Atom->GetID(); 1048 push @AtomPathSymbols, $This->{AssignedAtomTypes}{$AtomID}; 1049 return \@AtomPathSymbols; 1050 } 1051 1052 # Ignore bond information... 1053 if (!$This->{UseBondSymbols}) { 1054 for $Atom (@{$PathAtomsRef}) { 1055 $AtomID = $Atom->GetID(); 1056 push @AtomPathSymbols, $This->{AssignedAtomTypes}{$AtomID}; 1057 } 1058 return \@AtomPathSymbols; 1059 } 1060 1061 # Use atoms and bonds to generate atom path string... 1062 my($Index, $BondedAtom, $BondedAtomID, $BondAtomID1, $BondAtomID2); 1063 1064 # Process atom type of first atom in path... 1065 $Atom = $PathAtomsRef->[0]; $AtomID = $Atom->GetID(); 1066 push @AtomPathSymbols, $This->{AssignedAtomTypes}{$AtomID}; 1067 1068 for $Index (0 .. ($#{$PathAtomsRef} - 1)) { 1069 $Atom = $PathAtomsRef->[$Index]; $BondedAtom = $PathAtomsRef->[$Index + 1]; 1070 $AtomID = $Atom->GetID(); $BondedAtomID = $BondedAtom->GetID(); 1071 1072 ($BondAtomID1, $BondAtomID2) = ($AtomID < $BondedAtomID) ? ($AtomID, $BondedAtomID) : ($BondedAtomID, $AtomID); 1073 push @AtomPathSymbols, $This->{BondSymbols}{$BondAtomID1}{$BondAtomID2}; 1074 1075 # Process atom type of next atom in path... 1076 push @AtomPathSymbols, $This->{AssignedAtomTypes}{$BondedAtomID}; 1077 } 1078 return \@AtomPathSymbols; 1079 } 1080 1081 # Set final fingerprits... 1082 # 1083 sub _SetFinalFingerprints { 1084 my($This) = @_; 1085 1086 # Mark successful generation of fingerprints... 1087 $This->{FingerprintsGenerated} = 1; 1088 1089 if ($This->{Type} =~ /^PathLengthBits$/i) { 1090 $This->_SetFinalFingerprintsBitVector(); 1091 } 1092 elsif ($This->{Type} =~ /^PathLengthCount$/i) { 1093 $This->_SetFinalFingerprintsVector(); 1094 } 1095 1096 return $This; 1097 } 1098 1099 # Set final fingerprits bit vector... 1100 # 1101 sub _SetFinalFingerprintsBitVector { 1102 my($This) = @_; 1103 my($PathLength, $Size, $AtomPathString, $AtomPathHashCode, $AtomPathBitPos, $FingerprintsBitVector, $SkipBitPosCheck, $NumOfBitsToSetPerPath, $SetBitNum); 1104 1105 $FingerprintsBitVector = $This->{FingerprintsBitVector}; 1106 1107 $Size = $This->{Size}; 1108 1109 $SkipBitPosCheck = 1; 1110 $NumOfBitsToSetPerPath = $This->{NumOfBitsToSetPerPath}; 1111 1112 for $PathLength (keys %{$This->{AtomPathsStrings}}) { 1113 for $AtomPathString (keys %{$This->{AtomPathsStrings}{$PathLength}}) { 1114 $AtomPathHashCode = TextUtil::HashCode($AtomPathString); 1115 1116 # Set random number seed... 1117 if ($This->{UsePerlCoreRandom}) { 1118 CORE::srand($AtomPathHashCode); 1119 } 1120 else { 1121 MathUtil::srandom($AtomPathHashCode); 1122 } 1123 1124 for $SetBitNum (1 .. $NumOfBitsToSetPerPath) { 1125 $AtomPathBitPos = $This->{UsePerlCoreRandom} ? int(CORE::rand($Size)) : int(MathUtil::random($Size)); 1126 $FingerprintsBitVector->SetBit($AtomPathBitPos, $SkipBitPosCheck); 1127 } 1128 } 1129 } 1130 return $This; 1131 } 1132 1133 # Set final fingerprits vector... 1134 # 1135 sub _SetFinalFingerprintsVector { 1136 my($This) = @_; 1137 my($PathLength, $AtomPathString, $FingerprintsVector, $AtomPathCount, @Values, @ValueIDs); 1138 1139 @Values = (); 1140 @ValueIDs = (); 1141 1142 for $PathLength (sort { $a <=> $b } keys %{$This->{AtomPathsStrings}}) { 1143 for $AtomPathString (sort keys %{$This->{AtomPathsStrings}{$PathLength}}) { 1144 $AtomPathCount = $This->{AtomPathsStrings}{$PathLength}{$AtomPathString}; 1145 1146 push @Values, $AtomPathCount; 1147 push @ValueIDs, $AtomPathString; 1148 } 1149 } 1150 1151 # Add PathLengthIDs and values to fingerprint vector... 1152 $This->{FingerprintsVector}->AddValueIDs(\@ValueIDs); 1153 $This->{FingerprintsVector}->AddValues(\@Values); 1154 1155 return $This; 1156 } 1157 1158 # Cache appropriate molecule data... 1159 # 1160 sub _SetupMoleculeDataCache { 1161 my($This) = @_; 1162 1163 # Get all atoms... 1164 @{$This->{Atoms}} = $This->GetMolecule()->GetAtoms(); 1165 1166 return $This; 1167 } 1168 1169 # Clear cached molecule data... 1170 # 1171 sub _ClearMoleculeDataCache { 1172 my($This) = @_; 1173 1174 # Clear atoms... 1175 @{$This->{Atoms}} = (); 1176 1177 # Clear path atoms.. 1178 $This->{AtomPathsRef} = ''; 1179 1180 return $This; 1181 } 1182 1183 # Set atomic invariants to use atom identifiers... 1184 # 1185 sub SetAtomicInvariantsToUse { 1186 my($This, @Values) = @_; 1187 my($FirstValue, $TypeOfFirstValue, $AtomicInvariant, $SpecifiedAtomicInvariant, $AtomicInvariantValue, @SpecifiedAtomicInvariants, @AtomicInvariantsToUse); 1188 1189 if (!@Values) { 1190 carp "Warning: ${ClassName}->SetAtomicInvariantsToUse: No values specified..."; 1191 return; 1192 } 1193 1194 $FirstValue = $Values[0]; 1195 $TypeOfFirstValue = ref $FirstValue; 1196 1197 @SpecifiedAtomicInvariants = (); 1198 @AtomicInvariantsToUse = (); 1199 1200 if ($TypeOfFirstValue =~ /^ARRAY/) { 1201 push @SpecifiedAtomicInvariants, @{$FirstValue}; 1202 } 1203 else { 1204 push @SpecifiedAtomicInvariants, @Values; 1205 } 1206 1207 # Make sure specified AtomicInvariants are valid... 1208 for $SpecifiedAtomicInvariant (@SpecifiedAtomicInvariants) { 1209 if (!AtomTypes::AtomicInvariantsAtomTypes::IsAtomicInvariantAvailable($SpecifiedAtomicInvariant)) { 1210 croak "Error: ${ClassName}->SetAtomicInvariantsToUse: Specified atomic invariant, $SpecifiedAtomicInvariant, is not supported...\n "; 1211 } 1212 $AtomicInvariant = $SpecifiedAtomicInvariant; 1213 push @AtomicInvariantsToUse, $AtomicInvariant; 1214 } 1215 1216 # Set atomic invariants to use... 1217 @{$This->{AtomicInvariantsToUse}} = (); 1218 push @{$This->{AtomicInvariantsToUse}}, @AtomicInvariantsToUse; 1219 1220 return $This; 1221 } 1222 1223 # Set functional classes to use for atom identifiers... 1224 # 1225 sub SetFunctionalClassesToUse { 1226 my($This, @Values) = @_; 1227 my($FirstValue, $TypeOfFirstValue, $FunctionalClass, $SpecifiedFunctionalClass, @SpecifiedFunctionalClasses, @FunctionalClassesToUse); 1228 1229 if (!@Values) { 1230 carp "Warning: ${ClassName}->SetFunctionalClassesToUse: No values specified..."; 1231 return; 1232 } 1233 1234 if ($This->{AtomIdentifierType} !~ /^FunctionalClassAtomTypes$/i) { 1235 carp "Warning: ${ClassName}->SetFunctionalClassesToUse: FunctionalClassesToUse can't be set for InitialAtomIdentifierType of $This->{AtomIdentifierType}..."; 1236 return; 1237 } 1238 1239 $FirstValue = $Values[0]; 1240 $TypeOfFirstValue = ref $FirstValue; 1241 1242 @SpecifiedFunctionalClasses = (); 1243 @FunctionalClassesToUse = (); 1244 1245 if ($TypeOfFirstValue =~ /^ARRAY/) { 1246 push @SpecifiedFunctionalClasses, @{$FirstValue}; 1247 } 1248 else { 1249 push @SpecifiedFunctionalClasses, @Values; 1250 } 1251 1252 # Make sure specified FunctionalClasses are valid... 1253 for $SpecifiedFunctionalClass (@SpecifiedFunctionalClasses) { 1254 if (!AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($SpecifiedFunctionalClass)) { 1255 croak "Error: ${ClassName}->SetFunctionalClassesToUse: Specified functional class, $SpecifiedFunctionalClass, is not supported...\n "; 1256 } 1257 push @FunctionalClassesToUse, $SpecifiedFunctionalClass; 1258 } 1259 1260 # Set functional classes to use... 1261 @{$This->{FunctionalClassesToUse}} = (); 1262 push @{$This->{FunctionalClassesToUse}}, @FunctionalClassesToUse; 1263 1264 return $This; 1265 } 1266 1267 # Initialize atom indentifier type information... 1268 # 1269 # Current supported values: 1270 # 1271 # AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, 1272 # MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes 1273 # 1274 sub _InitializeAtomIdentifierTypeInformation { 1275 my($This) = @_; 1276 1277 if ($This->{AtomIdentifierType} =~ /^AtomicInvariantsAtomTypes$/i) { 1278 $This->_InitializeAtomicInvariantsAtomTypesInformation(); 1279 } 1280 elsif ($This->{AtomIdentifierType} =~ /^FunctionalClassAtomTypes$/i) { 1281 $This->_InitializeFunctionalClassAtomTypesInformation(); 1282 } 1283 elsif ($This->{AtomIdentifierType} =~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) { 1284 # Nothing to do for now... 1285 } 1286 else { 1287 croak "Error: ${ClassName}->_InitializeAtomIdentifierTypeInformation: Unknown atom indentifier type $This->{AtomIdentifierType}..."; 1288 } 1289 1290 return $This; 1291 } 1292 1293 # Initialize atomic invariants atom types to use for generating atom identifiers... 1294 # 1295 # Let: 1296 # AS = Atom symbol corresponding to element symbol 1297 # 1298 # X<n> = Number of non-hydrogen atom neighbors or heavy atoms attached to atom 1299 # BO<n> = Sum of bond orders to non-hydrogen atom neighbors or heavy atoms attached to atom 1300 # LBO<n> = Largest bond order of non-hydrogen atom neighbors or heavy atoms attached to atom 1301 # SB<n> = Number of single bonds to non-hydrogen atom neighbors or heavy atoms attached to atom 1302 # DB<n> = Number of double bonds to non-hydrogen atom neighbors or heavy atoms attached to atom 1303 # TB<n> = Number of triple bonds to non-hydrogen atom neighbors or heavy atoms attached to atom 1304 # H<n> = Number of implicit and explicit hydrogens for atom 1305 # Ar = Aromatic annotation indicating whether atom is aromatic 1306 # RA = Ring atom annotation indicating whether atom is a ring 1307 # FC<+n/-n> = Formal charge assigned to atom 1308 # MN<n> = Mass number indicating isotope other than most abundant isotope 1309 # SM<n> = Spin multiplicity of atom. Possible values: 1 (singlet), 2 (doublet) or 3 (triplet) 1310 # 1311 # Then: 1312 # 1313 # Atom type generated by AtomTypes::AtomicInvariantsAtomTypes class corresponds to: 1314 # 1315 # AS.X<n>.BO<n>.LBO<n>.<SB><n>.<DB><n>.<TB><n>.H<n>.Ar.RA.FC<+n/-n>.MN<n>.SM<n> 1316 # 1317 # Except for AS which is a required atomic invariant in atom types, all other atomic invariants are 1318 # optional. Default atomic invariants used for AtomID are: AS, X<n>, BO<n>, H<n>, FC<+n/-n>. 1319 # AtomID specification doesn't include atomic invariants with zero or undefined values. 1320 # 1321 sub _InitializeAtomicInvariantsAtomTypesInformation { 1322 my($This) = @_; 1323 1324 # Default atomic invariants to use for generating atom neighborhood atom IDs: AS, X, BO, H, FC 1325 # 1326 @{$This->{AtomicInvariantsToUse}} = (); 1327 @{$This->{AtomicInvariantsToUse}} = ('AS', 'X', 'BO', 'H', 'FC'); 1328 1329 return $This; 1330 } 1331 1332 # Initialize functional class atom types, generated by AtomTypes::FunctionalClassAtomTypes 1333 # class, to use for generating atom identifiers... 1334 # 1335 # Let: 1336 # HBD: HydrogenBondDonor 1337 # HBA: HydrogenBondAcceptor 1338 # PI : PositivelyIonizable 1339 # NI : NegativelyIonizable 1340 # Ar : Aromatic 1341 # Hal : Halogen 1342 # H : Hydrophobic 1343 # RA : RingAtom 1344 # CA : ChainAtom 1345 # 1346 # Then: 1347 # 1348 # Functiononal class atom type specification for an atom corresponds to: 1349 # 1350 # Ar.CA.H.HBA.HBD.Hal.NI.PI.RA 1351 # 1352 # Default functional classes used are: HBD, HBA, PI, NI, Ar, Hal 1353 # 1354 # FunctionalAtomTypes are assigned using the following definitions [ Ref 60-61, Ref 65-66 ]: 1355 # 1356 # HydrogenBondDonor: NH, NH2, OH 1357 # HydrogenBondAcceptor: N[!H], O 1358 # PositivelyIonizable: +, NH2 1359 # NegativelyIonizable: -, C(=O)OH, S(=O)OH, P(=O)OH 1360 # 1361 sub _InitializeFunctionalClassAtomTypesInformation { 1362 my($This) = @_; 1363 1364 # Default functional class atom typess to use for generating atom identifiers 1365 # are: HBD, HBA, PI, NI, Ar, Hal 1366 # 1367 @{$This->{FunctionalClassesToUse}} = (); 1368 @{$This->{FunctionalClassesToUse}} = ('HBD', 'HBA', 'PI', 'NI', 'Ar', 'Hal'); 1369 1370 return $This; 1371 } 1372 1373 # Return a string containg data for PathLengthFingerprints object... 1374 # 1375 sub StringifyPathLengthFingerprints { 1376 my($This) = @_; 1377 my($PathLengthsFingerprintsString); 1378 1379 # Type of fingerprint... 1380 $PathLengthsFingerprintsString = "Fingerprint type: $This->{Type}; AtomIdentifierType: $This->{AtomIdentifierType}"; 1381 1382 # Path length... 1383 $PathLengthsFingerprintsString .= "; MinPathLength: $This->{MinLength}; MaxPathLength: $This->{MaxLength}"; 1384 1385 # Fingerprint generation control... 1386 my($AllowSharedBonds, $AllowRings, $UseBondSymbols, $UseUniquePaths); 1387 1388 $AllowSharedBonds = $This->{AllowSharedBonds} ? "Yes" : "No"; 1389 $AllowRings = $This->{AllowRings} ? "Yes" : "No"; 1390 $UseBondSymbols = $This->{UseBondSymbols} ? "Yes" : "No"; 1391 $UseUniquePaths = $This->{UseBondSymbols} ? "Yes" : "No"; 1392 1393 $PathLengthsFingerprintsString .= "; UseUniquePaths: $UseUniquePaths; AllowSharedBonds: $AllowSharedBonds; AllowRings: $AllowRings; UseBondSymbols: $UseBondSymbols"; 1394 1395 if ($This->{AtomIdentifierType} =~ /^AtomicInvariantsAtomTypes$/i) { 1396 my($AtomicInvariant, @AtomicInvariants, @AtomicInvariantsOrder, %AvailableAtomicInvariants); 1397 1398 @AtomicInvariantsOrder = AtomTypes::AtomicInvariantsAtomTypes::GetAtomicInvariantsOrder(); 1399 %AvailableAtomicInvariants = AtomTypes::AtomicInvariantsAtomTypes::GetAvailableAtomicInvariants(); 1400 1401 for $AtomicInvariant (@AtomicInvariantsOrder) { 1402 push @AtomicInvariants, "$AtomicInvariant: $AvailableAtomicInvariants{$AtomicInvariant}"; 1403 } 1404 1405 $PathLengthsFingerprintsString .= "; AtomicInvariantsToUse: <" . TextUtil::JoinWords(\@{$This->{AtomicInvariantsToUse}}, ", ", 0) . ">"; 1406 $PathLengthsFingerprintsString .= "; AtomicInvariantsOrder: <" . TextUtil::JoinWords(\@AtomicInvariantsOrder, ", ", 0) . ">"; 1407 $PathLengthsFingerprintsString .= "; AvailableAtomicInvariants: <" . TextUtil::JoinWords(\@AtomicInvariants, ", ", 0) . ">"; 1408 } 1409 elsif ($This->{AtomIdentifierType} =~ /^FunctionalClassAtomTypes$/i) { 1410 my($FunctionalClass, @FunctionalClasses, @FunctionalClassesOrder, %AvailableFunctionalClasses); 1411 1412 @FunctionalClassesOrder = AtomTypes::FunctionalClassAtomTypes::GetFunctionalClassesOrder(); 1413 %AvailableFunctionalClasses = AtomTypes::FunctionalClassAtomTypes::GetAvailableFunctionalClasses(); 1414 1415 for $FunctionalClass (@FunctionalClassesOrder) { 1416 push @FunctionalClasses, "$FunctionalClass: $AvailableFunctionalClasses{$FunctionalClass}"; 1417 } 1418 1419 $PathLengthsFingerprintsString .= "; FunctionalClassesToUse: <" . TextUtil::JoinWords(\@{$This->{FunctionalClassesToUse}}, ", ", 0) . ">"; 1420 $PathLengthsFingerprintsString .= "; FunctionalClassesOrder: <" . TextUtil::JoinWords(\@FunctionalClassesOrder, ", ", 0) . ">"; 1421 $PathLengthsFingerprintsString .= "; AvailableFunctionalClasses: <" . TextUtil::JoinWords(\@FunctionalClasses, ", ", 0) . ">"; 1422 } 1423 1424 if ($This->{Type} =~ /^PathLengthBits$/i) { 1425 # Size... 1426 $PathLengthsFingerprintsString .= "; Size: $This->{Size}; MinSize: $This->{MinSize}; MaxSize: $This->{MaxSize}"; 1427 1428 # NumOfBitsToSetPerPath... 1429 $PathLengthsFingerprintsString .= "; NumOfBitsToSetPerPath: $This->{NumOfBitsToSetPerPath}"; 1430 1431 # Fingerprint bit density and num of bits set... 1432 my($NumOfSetBits, $BitDensity); 1433 $NumOfSetBits = $This->{FingerprintsBitVector}->GetNumOfSetBits(); 1434 $BitDensity = $This->{FingerprintsBitVector}->GetFingerprintsBitDensity(); 1435 $PathLengthsFingerprintsString .= "; NumOfOnBits: $NumOfSetBits; BitDensity: $BitDensity"; 1436 1437 $PathLengthsFingerprintsString .= "; FingerprintsBitVector: < $This->{FingerprintsBitVector} >"; 1438 } 1439 elsif ($This->{Type} =~ /^PathLengthCount$/i) { 1440 $PathLengthsFingerprintsString .= "; FingerprintsVector: < $This->{FingerprintsVector} >"; 1441 } 1442 1443 return $PathLengthsFingerprintsString; 1444 } 1445