1 package SDFileUtil; 2 # 3 # File: SDFileUtil.pm 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # This file is part of MayaChemTools. 9 # 10 # MayaChemTools is free software; you can redistribute it and/or modify it under 11 # the terms of the GNU Lesser General Public License as published by the Free 12 # Software Foundation; either version 3 of the License, or (at your option) any 13 # later version. 14 # 15 # MayaChemTools is distributed in the hope that it will be useful, but without 16 # any warranty; without even the implied warranty of merchantability of fitness 17 # for a particular purpose. See the GNU Lesser General Public License for more 18 # details. 19 # 20 # You should have received a copy of the GNU Lesser General Public License 21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 23 # Boston, MA, 02111-1307, USA. 24 # 25 26 use strict; 27 use Exporter; 28 use Carp; 29 use PeriodicTable qw(IsElement); 30 use TimeUtil (); 31 32 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); 33 34 @ISA = qw(Exporter); 35 @EXPORT = qw(GenerateCmpdAtomLine GenerateCmpdBondLine GenerateCmpdChargePropertyLines GenerateCmpdCommentsLine GenerateCmpdCountsLine GenerateCmpdAtomAliasPropertyLines GenerateCmpdIsotopePropertyLines GenerateCmpdDataHeaderLabelsAndValuesLines GenerateCmpdMiscInfoLine GenerateCmpdRadicalPropertyLines GenerateCmpdMolNameLine GenerateEmptyCtabBlockLines GenerateMiscLineDateStamp GetAllAndCommonCmpdDataHeaderLabels GetCmpdDataHeaderLabels GetCmpdDataHeaderLabelsAndValues GetCmpdFragments GetCtabLinesCount GetUnknownAtoms GetInvalidAtomNumbers MDLChargeToInternalCharge InternalChargeToMDLCharge MDLBondTypeToInternalBondOrder InternalBondOrderToMDLBondType MDLBondStereoToInternalBondStereochemistry InternalBondStereochemistryToMDLBondStereo InternalSpinMultiplicityToMDLRadical MDLRadicalToInternalSpinMultiplicity IsCmpd3D IsCmpd2D ParseCmpdAtomLine ParseCmpdBondLine ParseCmpdCommentsLine ParseCmpdCountsLine ParseCmpdMiscInfoLine ParseCmpdMolNameLine ParseCmpdAtomAliasPropertyLine ParseCmpdChargePropertyLine ParseCmpdIsotopePropertyLine ParseCmpdRadicalPropertyLine ReadCmpdString RemoveCmpdDataHeaderLabelAndValue WashCmpd); 36 @EXPORT_OK = qw(); 37 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); 38 39 # Format data for compounds count line... 40 sub GenerateCmpdCountsLine { 41 my($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version, $Line); 42 43 if (@_ == 5) { 44 ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version) = @_; 45 } 46 elsif (@_ == 3) { 47 ($AtomCount, $BondCount, $ChiralFlag) = @_; 48 $PropertyCount = 999; 49 $Version = "V2000"; 50 } 51 else { 52 ($AtomCount, $BondCount) = @_; 53 $ChiralFlag = 0; 54 $PropertyCount = 999; 55 $Version = "V2000"; 56 } 57 if ($AtomCount > 999) { 58 croak "Error: SDFileUtil::GenerateCmpdCountsLine: The atom count, $AtomCount, exceeds maximum of 999 allowed for CTAB version 2000. The Extended Connection Table (V3000) format in MDL MOL and SD files is not supported by the current release of MayaChemTools..."; 59 } 60 $Line = sprintf "%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%6s", $AtomCount, $BondCount, 0, 0, $ChiralFlag, 0, 0, 0, 0, 0, $PropertyCount, $Version; 61 62 return ($Line); 63 } 64 65 # Generate comments line... 66 sub GenerateCmpdCommentsLine { 67 my($Comments) = @_; 68 my($Line); 69 70 $Line = (length($Comments) > 80) ? substr($Comments, 0, 80) : $Comments; 71 72 return $Line; 73 } 74 75 # Generate molname line... 76 sub GenerateCmpdMolNameLine { 77 my($MolName) = @_; 78 my($Line); 79 80 $Line = (length($MolName) > 80) ? substr($MolName, 0, 80) : $MolName; 81 82 return $Line; 83 } 84 85 # Generate data for compounds misc info line... 86 sub GenerateCmpdMiscInfoLine { 87 my($ProgramName, $UserInitial, $Code) = @_; 88 my($Date, $Line); 89 90 if (!(defined($ProgramName) && $ProgramName)) { 91 $ProgramName = "MayaChem"; 92 } 93 if (!(defined($UserInitial) && $UserInitial)) { 94 $UserInitial = " "; 95 } 96 if (!(defined($Code) && $Code)) { 97 $Code = "2D"; 98 } 99 100 if (length($ProgramName) > 8) { 101 $ProgramName = substr($ProgramName, 0, 8); 102 } 103 if (length($UserInitial) > 2) { 104 $UserInitial = substr($UserInitial, 0, 2); 105 } 106 if (length($Code) > 2) { 107 $Code = substr($Code, 0, 2); 108 } 109 $Date = GenerateMiscLineDateStamp(); 110 111 $Line = "${UserInitial}${ProgramName}${Date}${Code}"; 112 113 return $Line; 114 } 115 116 # Generate data for compounds misc info line... 117 sub GenerateEmptyCtabBlockLines { 118 my($Date, $Lines); 119 120 if (@_ == 1) { 121 ($Date) = @_; 122 } 123 else { 124 $Date = GenerateMiscLineDateStamp(); 125 } 126 # First line: Blank molname line... 127 # Second line: Misc info... 128 # Third line: Blank comments line... 129 # Fourth line: Counts line reflecting empty structure data block... 130 $Lines = "\n"; 131 $Lines .= " MayaChem${Date}2D\n"; 132 $Lines .= "\n"; 133 $Lines .= GenerateCmpdCountsLine(0, 0, 0) . "\n"; 134 $Lines .= "M END"; 135 136 return $Lines; 137 } 138 139 # Generate SD file data stamp... 140 sub GenerateMiscLineDateStamp { 141 return TimeUtil::SDFileTimeStamp(); 142 } 143 144 # Generate data for compound atom line... 145 # 146 sub GenerateCmpdAtomLine { 147 my($AtomSymbol, $AtomX, $AtomY, $AtomZ, $MassDifference, $Charge, $StereoParity) = @_; 148 my($Line); 149 150 if (!defined $MassDifference) { 151 $MassDifference = 0; 152 } 153 if (!defined $Charge) { 154 $Charge = 0; 155 } 156 if (!defined $StereoParity) { 157 $StereoParity = 0; 158 } 159 $Line = sprintf "%10.4f%10.4f%10.4f %-3s%2i%3i%3i 0 0 0 0 0 0 0 0 0", $AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity; 160 161 return $Line 162 } 163 164 # Generate data for compound bond line... 165 # 166 sub GenerateCmpdBondLine { 167 my($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = @_; 168 my($Line); 169 170 if (!defined $BondStereo) { 171 $BondStereo = 0; 172 } 173 $Line = sprintf "%3i%3i%3i%3i 0 0 0", $FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo; 174 175 return $Line 176 } 177 178 # Generate charge property lines for CTAB block... 179 # 180 sub GenerateCmpdChargePropertyLines { 181 my($ChargeValuePairsRef) = @_; 182 183 return _GenerateCmpdGenericPropertyLines('Charge', $ChargeValuePairsRef); 184 } 185 186 # Generate isotope property lines for CTAB block... 187 # 188 sub GenerateCmpdIsotopePropertyLines { 189 my($IsotopeValuePairsRef) = @_; 190 191 return _GenerateCmpdGenericPropertyLines('Isotope', $IsotopeValuePairsRef); 192 } 193 194 # Generate radical property line property lines for CTAB block... 195 # 196 sub GenerateCmpdRadicalPropertyLines { 197 my($RadicalValuePairsRef) = @_; 198 199 return _GenerateCmpdGenericPropertyLines('Radical', $RadicalValuePairsRef); 200 } 201 202 # Generate atom alias property line property lines for CTAB block... 203 # 204 # Atom alias property line format: 205 # 206 # A aaa 207 # x... 208 # 209 # aaa: Atom number 210 # x: Atom alias in next line 211 # 212 sub GenerateCmpdAtomAliasPropertyLines { 213 my($PropertyValuePairsRef) = @_; 214 my($Index, $AtomNum, $AtomAlias, $Line, @PropertyLines); 215 216 @PropertyLines = (); 217 218 for ($Index = 0; $Index < $#{$PropertyValuePairsRef}; $Index += 2) { 219 $AtomNum = $PropertyValuePairsRef->[$Index]; 220 $AtomAlias = $PropertyValuePairsRef->[$Index + 1]; 221 222 $Line = "A " . sprintf "%3i", $AtomNum; 223 224 push @PropertyLines, $Line; 225 push @PropertyLines, $AtomAlias; 226 } 227 228 return @PropertyLines; 229 } 230 231 # Generate data header labels and values lines... 232 # 233 sub GenerateCmpdDataHeaderLabelsAndValuesLines { 234 my($DataHeaderLabelsRef, $DataHeaderLabelsAndValuesRef, $SortDataLabels) = @_; 235 my($DataLabel, $DataValue, @DataLabels, @DataLines); 236 237 if (!defined $SortDataLabels) { 238 $SortDataLabels = 0; 239 } 240 241 @DataLines = (); 242 @DataLabels = (); 243 if ($SortDataLabels) { 244 push @DataLabels, sort @{$DataHeaderLabelsRef}; 245 } 246 else { 247 push @DataLabels, @{$DataHeaderLabelsRef}; 248 } 249 for $DataLabel (@DataLabels) { 250 $DataValue = ''; 251 if (exists $DataHeaderLabelsAndValuesRef->{$DataLabel}) { 252 $DataValue = $DataHeaderLabelsAndValuesRef->{$DataLabel}; 253 } 254 push @DataLines, ("> <${DataLabel}>", "$DataValue", ""); 255 } 256 return @DataLines; 257 } 258 259 # Parse data field header in SD file and return lists of all and common data field 260 # labels. 261 sub GetAllAndCommonCmpdDataHeaderLabels { 262 my($SDFileRef) = @_; 263 my($CmpdCount, $CmpdString, $Label, @CmpdLines, @DataFieldLabels, @CommonDataFieldLabels, %DataFieldLabelsMap); 264 265 $CmpdCount = 0; 266 @DataFieldLabels = (); 267 @CommonDataFieldLabels = (); 268 %DataFieldLabelsMap = (); 269 270 while ($CmpdString = ReadCmpdString($SDFileRef)) { 271 $CmpdCount++; 272 @CmpdLines = split "\n", $CmpdString; 273 # Process compound data header labels and figure out which ones are present for 274 # all the compounds... 275 if (@DataFieldLabels) { 276 my (@CmpdDataFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines); 277 my(%CmpdDataFieldLabelsMap) = (); 278 # Setup a map for the current labels... 279 for $Label (@CmpdDataFieldLabels) { 280 $CmpdDataFieldLabelsMap{$Label} = "PresentInSome"; 281 } 282 # Check the presence old labels for this compound; otherwise, mark 'em new... 283 for $Label (@DataFieldLabels) { 284 if (!$CmpdDataFieldLabelsMap{$Label}) { 285 $DataFieldLabelsMap{$Label} = "PresentInSome"; 286 } 287 } 288 # Check the presence this compound in the old labels; otherwise, add 'em... 289 for $Label (@CmpdDataFieldLabels ) { 290 if (!$DataFieldLabelsMap{$Label}) { 291 # It's a new label... 292 push @DataFieldLabels, $Label; 293 $DataFieldLabelsMap{$Label} = "PresentInSome"; 294 } 295 } 296 } 297 else { 298 # Get the initial label set and set up a map... 299 @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines); 300 for $Label (@DataFieldLabels) { 301 $DataFieldLabelsMap{$Label} = "PresentInAll"; 302 } 303 } 304 } 305 # Identify the common data field labels... 306 @CommonDataFieldLabels = (); 307 for $Label (@DataFieldLabels) { 308 if ($DataFieldLabelsMap{$Label} eq "PresentInAll") { 309 push @CommonDataFieldLabels, $Label; 310 } 311 } 312 return ($CmpdCount, \@DataFieldLabels, \@CommonDataFieldLabels); 313 } 314 315 # Parse all the data header labels and return 'em as an list... 316 # 317 # Format: 318 # 319 #> Data header line 320 #Data line(s) 321 #Blank line 322 # 323 # [Data Header] (one line) precedes each item of data, starts with a greater than (>) sign, and 324 # contains at least one of the following: 325 # The field name enclosed in angle brackets. For example: <melting.point> 326 # The field number, DTn , where n represents the number assigned to the field in a MACCS-II database 327 # 328 #Optional information for the data header includes: 329 # The compounds external and internal registry numbers. External registry numbers must be enclosed in parentheses. 330 # Any combination of information 331 # 332 #The following are examples of valid data headers: 333 #> <MELTING.POINT> 334 #> 55 (MD-08974) <BOILING.POINT> DT12 335 #> DT12 55 336 #> (MD-0894) <BOILING.POINT> FROM ARCHIVES 337 # 338 #Notes: Sometimes last blank line is missing and can be just followed by $$$$ 339 # 340 sub GetCmpdDataHeaderLabels { 341 my($CmpdLines) = @_; 342 my($CmpdLine, $Label, @Labels); 343 344 @Labels = (); 345 CMPDLINE: for $CmpdLine (@$CmpdLines) { 346 if ($CmpdLine !~ /^>/) { 347 next CMPDLINE; 348 } 349 # Does the line contains field name enclosed in angular brackets? 350 ($Label) = $CmpdLine =~ /<.*?>/g; 351 if (!defined($Label)) { 352 next CMPDLINE; 353 } 354 $Label =~ s/(<|>)//g; 355 push @Labels, $Label; 356 } 357 return (@Labels); 358 } 359 360 # Parse all the data header labels and values 361 sub GetCmpdDataHeaderLabelsAndValues { 362 my($CmpdLines) = @_; 363 my($CmpdLine, $CurrentLabel, $Label, $Value, $ValueCount, $ProcessingLabelData, @Values, %DataFields); 364 365 %DataFields = (); 366 $ProcessingLabelData = 0; 367 $ValueCount = 0; 368 CMPDLINE: for $CmpdLine (@$CmpdLines) { 369 if ($CmpdLine =~ /^\$\$\$\$/) { 370 last CMPDLINE; 371 } 372 if ($CmpdLine =~ /^>/) { 373 # Does the line contains field name enclosed in angular brackets? 374 ($Label) = $CmpdLine =~ /<.*?>/g; 375 if (defined $Label) { 376 $CurrentLabel = $Label; 377 $CurrentLabel =~ s/(<|>)//g; 378 $ProcessingLabelData = 0; 379 $ValueCount = 0; 380 381 if ($CurrentLabel) { 382 $ProcessingLabelData = 1; 383 $DataFields{$CurrentLabel} = ''; 384 next CMPDLINE; 385 } 386 } 387 else { 388 if (!$ProcessingLabelData) { 389 # Data line containing no <label> as allowed by SDF format. Just ignore it... 390 next CMPDLINE; 391 } 392 } 393 } 394 if (!$ProcessingLabelData) { 395 next CMPDLINE; 396 } 397 if (!(defined($CmpdLine) && length($CmpdLine))) { 398 # Blank line terminates value for a label... 399 $CurrentLabel = ''; 400 $ValueCount = 0; 401 $ProcessingLabelData = 0; 402 next CMPDLINE; 403 } 404 $ValueCount++; 405 $Value = $CmpdLine; 406 407 if ($ValueCount > 1) { 408 $DataFields{$CurrentLabel} .= "\n" . $Value; 409 } 410 else { 411 $DataFields{$CurrentLabel} = $Value; 412 } 413 } 414 return (%DataFields); 415 } 416 417 # Return an updated compoud string after removing data header label along with its 418 # value from the specified compound string... 419 # 420 sub RemoveCmpdDataHeaderLabelAndValue { 421 my($CmpdString, $DataHeaderLabel) = @_; 422 my($Line, $PorcessingDataHeaderLabel, @CmpdLines); 423 424 @CmpdLines = (); 425 $PorcessingDataHeaderLabel = 0; 426 427 CMPDLINE: for $Line (split "\n", $CmpdString) { 428 if ($Line =~ /^>/ && $Line =~ /<$DataHeaderLabel>/i) { 429 $PorcessingDataHeaderLabel = 1; 430 next CMPDLINE; 431 } 432 433 if ($PorcessingDataHeaderLabel) { 434 # Blank line indicates end of fingerprints data value... 435 if ($Line =~ /^\$\$\$\$/) { 436 push @CmpdLines, $Line; 437 $PorcessingDataHeaderLabel = 0; 438 } 439 elsif (!length($Line)) { 440 $PorcessingDataHeaderLabel = 0; 441 } 442 next CMPDLINE; 443 } 444 445 # Track compound lines without fingerprints data... 446 push @CmpdLines, $Line; 447 } 448 449 return join "\n", @CmpdLines; 450 } 451 452 # 453 # Using bond blocks, figure out the number of disconnected fragments and 454 # return their values along with the atom numbers in a string delimited by new 455 # line character. 456 # 457 sub GetCmpdFragments { 458 my($CmpdLines) = @_; 459 my($AtomCount, $BondCount, $FirstAtomNum, $SecondAtomNum, @AtomConnections, $BondType, $FragmentString, $FragmentCount, $LineIndex, $Index, $AtomNum, $NbrAtomNum, @ProcessedAtoms, $ProcessedAtomCount, $ProcessAtomNum, @ProcessingAtoms, @ConnectedAtoms, %Fragments, $FragmentNum, $AFragmentString); 460 461 # Setup the connection table for each atom... 462 @AtomConnections = (); 463 ($AtomCount, $BondCount) = ParseCmpdCountsLine(@$CmpdLines[3]); 464 for $AtomNum (1 .. $AtomCount) { 465 %{$AtomConnections[$AtomNum]} = (); 466 } 467 for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) { 468 ($FirstAtomNum, $SecondAtomNum, $BondType) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]); 469 if (!$AtomConnections[$FirstAtomNum]{$SecondAtomNum}) { 470 $AtomConnections[$FirstAtomNum]{$SecondAtomNum} = $BondType; 471 } 472 if (!$AtomConnections[$SecondAtomNum]{$FirstAtomNum}) { 473 $AtomConnections[$SecondAtomNum]{$FirstAtomNum} = $BondType; 474 } 475 } 476 477 #Get set to count fragments... 478 $ProcessedAtomCount = 0; 479 $FragmentNum = 0; 480 %Fragments = (); 481 @ProcessedAtoms = (); 482 for $AtomNum (1 .. $AtomCount) { 483 $ProcessedAtoms[$AtomNum] = 0; 484 } 485 while ($ProcessedAtomCount < $AtomCount) { 486 @ProcessingAtoms = (); 487 @ConnectedAtoms = (); 488 ATOMNUM: for $AtomNum (1 .. $AtomCount) { 489 if (!$ProcessedAtoms[$AtomNum]) { 490 $ProcessedAtomCount++; 491 $ProcessedAtoms[$AtomNum] = 1; 492 push @ProcessingAtoms, $AtomNum; 493 $FragmentNum++; 494 @{$Fragments{$FragmentNum} } = (); 495 push @{$Fragments{$FragmentNum} }, $AtomNum; 496 last ATOMNUM; 497 } 498 } 499 500 # Go over the neighbors and follow the connection trail while collecting the 501 # atoms numbers present in the connected fragment... 502 while (@ProcessingAtoms) { 503 for ($Index = 0; $Index < @ProcessingAtoms; $Index++) { 504 $ProcessAtomNum = $ProcessingAtoms[$Index]; 505 for $NbrAtomNum (keys %{$AtomConnections[$ProcessAtomNum]}) { 506 if (!$ProcessedAtoms[$NbrAtomNum]) { 507 $ProcessedAtomCount++; 508 $ProcessedAtoms[$NbrAtomNum] = 1; 509 push @ConnectedAtoms, $NbrAtomNum; 510 push @{ $Fragments{$FragmentNum} }, $NbrAtomNum; 511 } 512 } 513 } 514 @ProcessingAtoms = (); 515 @ProcessingAtoms = @ConnectedAtoms; 516 @ConnectedAtoms = (); 517 } 518 } 519 $FragmentCount = $FragmentNum; 520 $FragmentString = ""; 521 522 # Sort out the fragments by size... 523 for $FragmentNum (sort { @{$Fragments{$b}} <=> @{$Fragments{$a}} } keys %Fragments ) { 524 # Sort the atoms in a fragment by their numbers... 525 $AFragmentString = join " ", sort { $a <=> $b } @{ $Fragments{$FragmentNum} }; 526 if ($FragmentString) { 527 $FragmentString .= "\n" . $AFragmentString; 528 } 529 else { 530 $FragmentString = $AFragmentString; 531 } 532 } 533 return ($FragmentCount, $FragmentString); 534 } 535 536 # Count number of lines present in between 4th and line containg "M END" 537 sub GetCtabLinesCount { 538 my($CmpdLines) = @_; 539 my($LineIndex, $CtabLinesCount); 540 541 $CtabLinesCount = 0; 542 LINE: for ($LineIndex = 4; $LineIndex < @$CmpdLines; $LineIndex++) { 543 # 544 # Any line after atom and bond data starting with anything other than space or 545 # a digit indicates end of Ctab atom/bond data block... 546 # 547 if (@$CmpdLines[$LineIndex] !~ /^[0-9 ]/) { 548 $CtabLinesCount = $LineIndex - 4; 549 last LINE; 550 } 551 } 552 return $CtabLinesCount; 553 } 554 555 # Using atom blocks, count the number of atoms which contain special element 556 # symbols not present in the periodic table. 557 sub GetUnknownAtoms { 558 my($CmpdLines) = @_; 559 my($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines, $LineIndex, $AtomCount, $AtomSymbol); 560 561 $UnknownAtomCount = 0; 562 $UnknownAtoms = ""; 563 $UnknownAtomLines = ""; 564 ($AtomCount) = ParseCmpdCountsLine(@$CmpdLines[3]); 565 for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) { 566 ($AtomSymbol) = ParseCmpdAtomLine(@$CmpdLines[$LineIndex]); 567 if (!IsElement($AtomSymbol)) { 568 $UnknownAtomCount++; 569 $UnknownAtoms .= " $AtomSymbol"; 570 if ($UnknownAtomLines) { 571 $UnknownAtomLines .= "\n" . @$CmpdLines[$LineIndex]; 572 } 573 else { 574 $UnknownAtomLines = @$CmpdLines[$LineIndex]; 575 } 576 } 577 } 578 return ($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines); 579 } 580 581 # Check z coordinates of all atoms to see whether any of them is non-zero 582 # which makes the compound geometry three dimensional... 583 # 584 sub IsCmpd3D { 585 my($CmpdLines) = @_; 586 my($LineIndex, $AtomCount, $AtomSymbol, $AtomX, $AtomY, $AtomZ); 587 588 ($AtomCount) = ParseCmpdCountsLine(@$CmpdLines[3]); 589 for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) { 590 ($AtomSymbol, $AtomX, $AtomY, $AtomZ) = ParseCmpdAtomLine(@$CmpdLines[$LineIndex]); 591 if ($AtomZ != 0) { 592 return 1; 593 } 594 } 595 return 0; 596 } 597 598 # Check whether it's a 2D compound... 599 # 600 sub IsCmpd2D { 601 my($CmpdLines) = @_; 602 603 return IsCmpd3D($CmpdLines) ? 0 : 1; 604 } 605 606 # Using bond blocks, count the number of bond lines which contain atom numbers 607 # greater than atom count specified in compound count line... 608 # 609 sub GetInvalidAtomNumbers { 610 my($CmpdLines) = @_; 611 my($LineIndex, $AtomCount, $BondCount, $FirstAtomNum, $SecondAtomNum, $InvalidAtomNumbersCount, $InvalidAtomNumbers, $InvalidAtomNumberLines, $Line, $InvalidAtomPropertyLine, $ValuePairIndex, $AtomNum, $Value, @ValuePairs); 612 613 ($AtomCount, $BondCount) = ParseCmpdCountsLine(@$CmpdLines[3]); 614 615 $InvalidAtomNumbersCount = 0; 616 $InvalidAtomNumbers = ""; 617 $InvalidAtomNumberLines = ""; 618 619 # Go over bond block lines... 620 LINE: for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) { 621 ($FirstAtomNum, $SecondAtomNum) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]); 622 if ($FirstAtomNum <= $AtomCount && $SecondAtomNum <= $AtomCount) { 623 next LINE; 624 } 625 if ($FirstAtomNum > $AtomCount) { 626 $InvalidAtomNumbersCount++; 627 $InvalidAtomNumbers .= " $FirstAtomNum"; 628 } 629 if ($SecondAtomNum > $AtomCount) { 630 $InvalidAtomNumbersCount++; 631 $InvalidAtomNumbers .= " $SecondAtomNum"; 632 } 633 if ($InvalidAtomNumberLines) { 634 $InvalidAtomNumberLines .= "\n" . @$CmpdLines[$LineIndex]; 635 } 636 else { 637 $InvalidAtomNumberLines = @$CmpdLines[$LineIndex]; 638 } 639 } 640 # Go over property lines before M END... 641 # 642 LINE: for ($LineIndex = (4 + $AtomCount + $BondCount); $LineIndex < @$CmpdLines; $LineIndex++) { 643 $Line = @$CmpdLines[$LineIndex]; 644 @ValuePairs = (); 645 if ($Line =~ /^M END/i) { 646 last LINE; 647 } 648 @ValuePairs = (); 649 if ($Line =~ /^M CHG/i) { 650 @ValuePairs = ParseCmpdChargePropertyLine($Line); 651 } 652 elsif ($Line =~ /^M RAD/i) { 653 @ValuePairs = ParseCmpdRadicalPropertyLine($Line); 654 } 655 elsif ($Line =~ /^M ISO/i) { 656 @ValuePairs = ParseCmpdIsotopePropertyLine($Line); 657 } 658 elsif ($Line =~ /^A /i) { 659 my($NextLine); 660 $LineIndex++; 661 $NextLine = @$CmpdLines[$LineIndex]; 662 @ValuePairs = ParseCmpdAtomAliasPropertyLine($Line, $NextLine); 663 } 664 else { 665 next LINE; 666 } 667 668 $InvalidAtomPropertyLine = 0; 669 for ($ValuePairIndex = 0; $ValuePairIndex < $#ValuePairs; $ValuePairIndex += 2) { 670 $AtomNum = $ValuePairs[$ValuePairIndex]; $Value = $ValuePairs[$ValuePairIndex + 1]; 671 if ($AtomNum > $AtomCount) { 672 $InvalidAtomPropertyLine = 1; 673 $InvalidAtomNumbersCount++; 674 $InvalidAtomNumbers .= " $AtomNum"; 675 } 676 } 677 if ($InvalidAtomPropertyLine) { 678 if ($InvalidAtomNumberLines) { 679 $InvalidAtomNumberLines .= "\n" . $Line; 680 } 681 else { 682 $InvalidAtomNumberLines = $Line; 683 } 684 } 685 } 686 687 return ($InvalidAtomNumbersCount, $InvalidAtomNumbers, $InvalidAtomNumberLines); 688 } 689 690 # Ctab lines: Atom block 691 # 692 # Format: xxxxx.xxxxyyyyy.yyyyzzzzz.zzzz aaaddcccssshhhbbbvvvHHHrrriiimmmnnneee 693 # A10 A10 A10 xA3 A2A3 A3 A3 A3 A3 A3 A3 A3 A3 A3 A3 694 # x,y,z: Atom coordinates 695 # aaa: Atom symbol. Entry in periodic table or L for atom list, A, Q, * for unspecified 696 # atom, and LP for lone pair, or R# for Rgroup label 697 # dd: Mass difference. -3, -2, -1, 0, 1, 2, 3, 4 (0 for value beyond these limits) 698 # ccc: Charge. 0 = uncharged or value other than these, 1 = +3, 2 = +2, 3 = +1, 699 # 4 = doublet radical, 5 = -1, 6 = -2, 7 = -3 700 # sss: Atom stereo parity. 0 = not stereo, 1 = odd, 2 = even, 3 = either or unmarked stereo center 701 # hhh: Hydrogen count + 1. 1 = H0, 2 = H1, 3 = H2, 4 = H3, 5 = H4 702 # bbb: Stereo care box. 0 = ignore stereo configuration of this double bond atom, 1 = stereo 703 # configuration of double bond atom must match 704 # vvv: Valence. 0 = no marking (default)(1 to 14) = (1 to 14) 15 = zero valence 705 # HHH: H0 designator. 0 = not specified, 1 = no H atoms allowed (redundant due to hhh) 706 # rrr: Not used 707 # iii: Not used 708 # mmm: Atom-atom mapping number. 1 - number of atoms 709 # nnn: Inversion/retention flag. 0 = property not applied, 1 = configuration is inverted, 710 # 2 = configuration is retained. 711 # eee: Exact change flag. 0 = property not applied, 1 = change on atom must be 712 # exactly as shown 713 # 714 # Notes: 715 # . StereoParity: 1 - ClockwiseStereo, 2 - AntiClockwiseStereo; 3 - Either; 0 - none. These 716 # values determine chirailty around the chiral center; a non zero value indicates atom 717 # has been marked as chiral center. 718 # 719 sub ParseCmpdAtomLine { 720 my($Line) = @_; 721 my ($LineIndex, $AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity); 722 723 ($AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity) = ('') x 7; 724 if (length($Line) > 31) { 725 ($AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity) = unpack("A10A10A10xA3A2A3A3", $Line); 726 } 727 else { 728 ($AtomX, $AtomY, $AtomZ, $AtomSymbol) = unpack("A10A10A10", $Line); 729 } 730 return ($AtomSymbol, $AtomX, $AtomY, $AtomZ, $MassDifference, $Charge, $StereoParity); 731 } 732 733 # Map MDL charge value used in SD and MOL files to internal charge used by MayaChemTools. 734 # 735 sub MDLChargeToInternalCharge { 736 my($MDLCharge) = @_; 737 my($InternalCharge); 738 739 CHARGE: { 740 if ($MDLCharge == 0) { $InternalCharge = 0; last CHARGE;} 741 if ($MDLCharge == 1) { $InternalCharge = 3; last CHARGE;} 742 if ($MDLCharge == 2) { $InternalCharge = 2; last CHARGE;} 743 if ($MDLCharge == 3) { $InternalCharge = 1; last CHARGE;} 744 if ($MDLCharge == 5) { $InternalCharge = -1; last CHARGE;} 745 if ($MDLCharge == 6) { $InternalCharge = -2; last CHARGE;} 746 if ($MDLCharge == 7) { $InternalCharge = -3; last CHARGE;} 747 # All other MDL charge values, including 4 corresponding to "doublet radical", 748 # are assigned internal value of 0. 749 $InternalCharge = 0; 750 if ($MDLCharge != 4) { 751 carp "Warning: MDLChargeToInternalCharge: MDL charge value, $MDLCharge, is not supported: An internal charge value, 0, has been assigned..."; 752 } 753 } 754 return $InternalCharge; 755 } 756 757 # Map internal charge used by MayaChemTools to MDL charge value used in SD and MOL files. 758 # 759 sub InternalChargeToMDLCharge { 760 my($InternalCharge) = @_; 761 my($MDLCharge); 762 763 CHARGE: { 764 if ($InternalCharge == 3) { $MDLCharge = 1; last CHARGE;} 765 if ($InternalCharge == 2) { $MDLCharge = 2; last CHARGE;} 766 if ($InternalCharge == 1) { $MDLCharge = 3; last CHARGE;} 767 if ($InternalCharge == -1) { $MDLCharge = 5; last CHARGE;} 768 if ($InternalCharge == -2) { $MDLCharge = 6; last CHARGE;} 769 if ($InternalCharge == -3) { $MDLCharge = 7; last CHARGE;} 770 # All other MDL charge values, including 4 corresponding to "doublet radical", 771 # are assigned internal value of 0. 772 $MDLCharge = 0; 773 } 774 return $MDLCharge; 775 } 776 777 # Ctab lines: Bond block 778 # 779 # Format: 111222tttsssxxxrrrccc 780 # 781 # 111: First atom number. 782 # 222: Second atom number. 783 # ttt: Bond type. 1 = Single, 2 = Double, 3 = Triple, 4 = Aromatic, 5 = Single or Double, 784 # 6 = Single or Aromatic, 7 = Double or Aromatic, 8 = Any 785 # sss: Bond stereo. Single bonds: 0 = not stereo, 1 = Up, 4 = Either, 6 = Down, 786 # Double bonds: 0 = Use x-, y-, z-coords from atom block to determine cis or trans, 787 # 3 = Cis or trans (either) double bond 788 # xxx: Not used 789 # rrr: Bond topology. 0 = Either, 1 = Ring, 2 = Chain 790 # ccc: Reacting center status. 0 = unmarked, 1 = a center, -1 = not a center, 791 # Additional: 2 = no change,4 = bond made/broken, 8 = bond order changes 12 = 4+8 792 # (both made/broken and changes); 5 = (4 + 1), 9 = (8 + 1), and 13 = (12 + 1) are also possible 793 # 794 sub ParseCmpdBondLine { 795 my($Line) = @_; 796 my($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo); 797 798 ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = map {s/ //g; $_} unpack("A3A3A3A3", $Line); 799 return ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo); 800 } 801 802 # Map MDL bond type value used in SD and MOL files to internal bond order and bond types 803 # values used by MayaChemTools... 804 # 805 sub MDLBondTypeToInternalBondOrder { 806 my($MDLBondType) = @_; 807 my($InternalBondOrder, $InternalBondType); 808 809 $InternalBondType = ''; 810 811 BONDTYPE: { 812 if ($MDLBondType == 1) { $InternalBondOrder = 1; $InternalBondType = 'Single'; last BONDTYPE;} 813 if ($MDLBondType == 2) { $InternalBondOrder = 2; $InternalBondType = 'Double'; last BONDTYPE;} 814 if ($MDLBondType == 3) { $InternalBondOrder = 3; $InternalBondType = 'Triple'; last BONDTYPE;} 815 if ($MDLBondType == 4) { $InternalBondOrder = 1.5; $InternalBondType = 'Aromatic'; last BONDTYPE;} # Aromatic 816 if ($MDLBondType == 5) { $InternalBondOrder = 1; $InternalBondType = 'SingleOrDouble'; last BONDTYPE;} # Aromatic 817 if ($MDLBondType == 6) { $InternalBondOrder = 1; $InternalBondType = 'SingleOrAromatic'; last BONDTYPE;} # Aromatic 818 if ($MDLBondType == 7) { $InternalBondOrder = 2; $InternalBondType = 'DoubleOrAromatic'; last BONDTYPE;} # Aromatic 819 if ($MDLBondType == 8) { $InternalBondOrder = 1; $InternalBondType = 'Any'; last BONDTYPE;} # Aromatic 820 # 821 # Although MDL aromatic bond values are used for query only and explicit Kekule bond order 822 # values must be assigned, internal value of 1.5 is allowed to indicate aromatic bond orders. 823 # 824 # All other MDL bond type values - 5 = Single or Double, 6 = Single or Aromatic, 7 = Double or Aromatic, 825 # 8 = Any - are also assigned appropriate internal value of 1: These are meant to be used for 826 # structure queries by MDL products. 827 # 828 $InternalBondOrder = 1; 829 $InternalBondType = 'Single'; 830 831 carp "Warning: MDLBondTypeToInternalBondOrder: MDL bond type value, $MDLBondType, is not supported: An internal bond order value, 0, has been assigned..."; 832 } 833 return ($InternalBondOrder, $InternalBondType); 834 } 835 836 # Map internal bond order and bond type values used by MayaChemTools to MDL bond type value used 837 # in SD and MOL files... 838 # 839 sub InternalBondOrderToMDLBondType { 840 my($InternalBondOrder, $InternalBondType) = @_; 841 my($MDLBondType); 842 843 BONDTYPE: { 844 if ($InternalBondOrder == 1) { 845 if ($InternalBondType =~ /^SingleOrDouble$/i) { 846 $MDLBondType = 5; 847 } 848 elsif ($InternalBondType =~ /^SingleOrAromatic$/i) { 849 $MDLBondType = 6; 850 } 851 elsif ($InternalBondType =~ /^Any$/i) { 852 $MDLBondType = 8; 853 } 854 else { 855 $MDLBondType = 1; 856 } 857 $MDLBondType = 1; 858 last BONDTYPE; 859 } 860 if ($InternalBondOrder == 2) { 861 if ($InternalBondType =~ /^DoubleOrAromatic$/i) { 862 $MDLBondType = 7; 863 } 864 else { 865 $MDLBondType = 2; 866 } 867 last BONDTYPE; 868 } 869 if ($InternalBondOrder == 3) { $MDLBondType = 3; last BONDTYPE;} 870 if ($InternalBondOrder == 1.5) { $MDLBondType = 4; last BONDTYPE;} 871 if ($InternalBondType =~ /^Any$/i) { $MDLBondType = 8; last BONDTYPE;} 872 873 $MDLBondType = 1; 874 875 carp "Warning: InternalBondOrderToMDLBondType: Internal bond order and type values, $InternalBondOrder and $InternalBondType, don't match any valid MDL bond type: MDL bond type value, 1, has been assigned..."; 876 } 877 return $MDLBondType; 878 } 879 880 # Third line: Comments - A blank line is also allowed. 881 sub ParseCmpdCommentsLine { 882 my($Line) = @_; 883 my($Comments); 884 885 $Comments = unpack("A80", $Line); 886 887 return ($Comments); 888 } 889 890 # Map MDL bond stereo value used in SD and MOL files to internal bond stereochemistry values used by MayaChemTools... 891 # 892 sub MDLBondStereoToInternalBondStereochemistry { 893 my($MDLBondStereo) = @_; 894 my($InternalBondStereo); 895 896 $InternalBondStereo = ''; 897 898 BONDSTEREO: { 899 if ($MDLBondStereo == 1) { $InternalBondStereo = 'Up'; last BONDSTEREO;} 900 if ($MDLBondStereo == 4) { $InternalBondStereo = 'UpOrDown'; last BONDSTEREO;} 901 if ($MDLBondStereo == 6) { $InternalBondStereo = 'Down'; last BONDSTEREO;} 902 if ($MDLBondStereo == 3) { $InternalBondStereo = 'CisOrTrans'; last BONDSTEREO;} 903 if ($MDLBondStereo == 0) { $InternalBondStereo = 'None'; last BONDSTEREO;} 904 905 $InternalBondStereo = ''; 906 carp "Warning: MDLBondStereoToInternalBondType: MDL bond stereo value, $MDLBondStereo, is not supported: It has been ignored and bond order would be used to determine bond type..."; 907 } 908 return $InternalBondStereo; 909 } 910 911 # Map internal bond stereochemistry values used by MayaChemTools to MDL bond stereo value used in SD and MOL files... 912 # 913 sub InternalBondStereochemistryToMDLBondStereo { 914 my($InternalBondStereo) = @_; 915 my($MDLBondStereo); 916 917 $MDLBondStereo = 0; 918 919 BONDSTEREO: { 920 if ($InternalBondStereo =~ /^Up$/i) { $MDLBondStereo = 1; last BONDSTEREO;} 921 if ($InternalBondStereo =~ /^UpOrDown$/i) { $MDLBondStereo = 4; last BONDSTEREO;} 922 if ($InternalBondStereo =~ /^Down$/) { $MDLBondStereo = 6; last BONDSTEREO;} 923 if ($InternalBondStereo =~ /^CisOrTrans$/) { $MDLBondStereo = 3; last BONDSTEREO;} 924 925 $MDLBondStereo = 0; 926 } 927 return $MDLBondStereo; 928 } 929 930 # Fourth line: Counts 931 # 932 # Format: aaabbblllfffcccsssxxxrrrpppiiimmmvvvvvv 933 # 934 # aaa: number of atoms; bbb: number of bonds; lll: number of atom lists; fff: (obsolete) 935 # ccc: chiral flag: 0=not chiral, 1=chiral; sss: number of stext entries; xxx,rrr,ppp,iii: 936 # (obsolete); mmm: number of lines of additional properties, including the M END line, No 937 # longer supported, default is set to 999; vvvvvv: version 938 939 sub ParseCmpdCountsLine { 940 my($Line) = @_; 941 my($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version); 942 943 if (length($Line) >= 39) { 944 ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version) = unpack("A3A3x3x3A3x3x3x3x3x3A3A6", $Line); 945 } 946 elsif (length($Line) >= 15) { 947 ($PropertyCount, $Version) = ("999", "v2000"); 948 ($AtomCount, $BondCount, $ChiralFlag) = unpack("A3A3x3x3A3", $Line); 949 } 950 else { 951 ($ChiralFlag, $PropertyCount, $Version) = ("0", "999", "v2000"); 952 ($AtomCount, $BondCount) = unpack("A3A3", $Line); 953 } 954 955 if ($Version =~ /V3000/i) { 956 # Current version of MayaChemTools modules and classes for processing MDL MOL and SD don't support 957 # V3000. So instead of relying on callers, just exit with an error to disable any processing of V3000 958 # format. 959 croak "Error: SDFileUtil::ParseCmpdCountsLine: The Extended Connection Table (V3000) format in MDL MOL and SD files is not supported by the current release of MayaChemTools..."; 960 } 961 962 return ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version); 963 } 964 965 # Second line: Misc info 966 # 967 # Format: IIPPPPPPPPMMDDYYHHmmddSSssssssssssEEEEEEEEEEEERRRRRR 968 # A2A8 A10 A2I2A10 A12 A6 969 # User's first and last initials (I), program name (P), date/time (M/D/Y,H:m), 970 # dimensional codes - 2D or 3D (d),scaling factors (S, s), energy (E) if modeling program input, 971 # internal registry number (R) if input through MDL form. A blank line is also allowed. 972 sub ParseCmpdMiscInfoLine { 973 my($Line) = @_; 974 my($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum); 975 976 ($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum) = unpack("A2A8A10A2A2A10A12A6", $Line); 977 return ($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum); 978 } 979 980 # First line: Molecule name. This line is unformatted, but like all other lines in a 981 # molfile may not extend beyond column 80. A blank line is also allowed. 982 sub ParseCmpdMolNameLine { 983 my($Line) = @_; 984 my($MolName); 985 986 $MolName = unpack("A80", $Line); 987 988 return ($MolName); 989 } 990 991 # Parse atom alias property line in CTAB generic properties block. 992 # 993 # Atom alias property line format: 994 # 995 # A aaa 996 # x... 997 # 998 # aaa: Atom number 999 # x: Atom alias in next line 1000 # 1001 sub ParseCmpdAtomAliasPropertyLine { 1002 my($Line, $NextLine) = @_; 1003 my($Label, $AtomNumber, $AtomAlias); 1004 1005 ($Label, $AtomNumber) = split(' ', $Line); 1006 $AtomAlias = $NextLine; 1007 1008 if (!$AtomAlias) { 1009 carp "Warning: _ParseCmpdAtomAliasPropertyLine: No atom alias value specified on the line following atom alias property line..."; 1010 } 1011 1012 return ($AtomNumber, $AtomAlias); 1013 } 1014 1015 # Parse charge property line in CTAB generic properties block. 1016 # 1017 # Charge property line format: 1018 # 1019 # M CHGnn8 aaa vvv ... 1020 # 1021 # nn8: Number of value pairs. Maximum of 8 pairs allowed. 1022 # aaa: Atom number 1023 # vvv: -15 to +15. Default of 0 = uncharged atom. When present, this property supersedes 1024 # all charge and radical values in the atom block, forcing a 0 charge on all atoms not 1025 # listed in an M CHG or M RAD line. 1026 # 1027 sub ParseCmpdChargePropertyLine { 1028 my($Line) = @_; 1029 1030 return _ParseCmpdGenericPropertyLine('Charge', $Line); 1031 } 1032 1033 1034 # Parse isotope property line in CTAB generic properties block. 1035 # 1036 # Isoptope property line format: 1037 # 1038 # M ISOnn8 aaa vvv ... 1039 # 1040 # nn8: Number of value paris. Maximum of 8 pairs allowed. 1041 # aaa: Atom number 1042 # vvv: Absolute mass of the atom isotope as a positive integer. When present, this property 1043 # supersedes all isotope values in the atom block. Default (no entry) means natural 1044 # abundance. The difference between this absolute mass value and the natural 1045 # abundance value specified in the PTABLE.DAT file must be within the range of -18 1046 # to +12 1047 # 1048 # Notes: 1049 # . Values correspond to mass numbers... 1050 # 1051 sub ParseCmpdIsotopePropertyLine { 1052 my($Line) = @_; 1053 1054 return _ParseCmpdGenericPropertyLine('Isotope', $Line); 1055 } 1056 1057 # Parse radical property line in CTAB generic properties block. 1058 # 1059 # Radical property line format: 1060 # 1061 # M RADnn8 aaa vvv ... 1062 # 1063 # nn8: Number of value paris. Maximum of 8 pairs allowed. 1064 # aaa: Atom number 1065 # vvv: Default of 0 = no radical, 1 = singlet, 2 = doublet, 3 = triplet . When 1066 # present, this property supersedes all charge and radical values in the atom block, 1067 # forcing a 0 (zero) charge and radical on all atoms not listed in an M CHG or 1068 # M RAD line. 1069 # 1070 sub ParseCmpdRadicalPropertyLine { 1071 my($Line) = @_; 1072 1073 return _ParseCmpdGenericPropertyLine('Radical', $Line); 1074 } 1075 1076 # Map MDL radical stereo value used in SD and MOL files to internal spin multiplicity values used by MayaChemTools... 1077 # 1078 sub MDLRadicalToInternalSpinMultiplicity { 1079 my($MDLRadical) = @_; 1080 my($InternalSpinMultiplicity); 1081 1082 $InternalSpinMultiplicity = ''; 1083 1084 SPINMULTIPLICITY: { 1085 if ($MDLRadical == 0) { $InternalSpinMultiplicity = 0; last SPINMULTIPLICITY;} 1086 if ($MDLRadical == 1) { $InternalSpinMultiplicity = 1; last SPINMULTIPLICITY;} 1087 if ($MDLRadical == 2) { $InternalSpinMultiplicity = 2; last SPINMULTIPLICITY;} 1088 if ($MDLRadical == 3) { $InternalSpinMultiplicity = 3; last SPINMULTIPLICITY;} 1089 $InternalSpinMultiplicity = ''; 1090 carp "Warning: MDLRadicalToInternalSpinMultiplicity: MDL radical value, $MDLRadical, specifed on line M RAD is not supported..."; 1091 } 1092 return $InternalSpinMultiplicity; 1093 } 1094 1095 # Map internal spin multiplicity values used by MayaChemTools to MDL radical stereo value used in SD and MOL files... 1096 # 1097 sub InternalSpinMultiplicityToMDLRadical { 1098 my($InternalSpinMultiplicity) = @_; 1099 my($MDLRadical); 1100 1101 $MDLRadical = 0; 1102 1103 SPINMULTIPLICITY: { 1104 if ($InternalSpinMultiplicity == 1) { $MDLRadical = 1; last SPINMULTIPLICITY;} 1105 if ($InternalSpinMultiplicity == 2) { $MDLRadical = 2; last SPINMULTIPLICITY;} 1106 if ($InternalSpinMultiplicity == 3) { $MDLRadical = 3; last SPINMULTIPLICITY;} 1107 $MDLRadical = 0; 1108 } 1109 return $MDLRadical; 1110 } 1111 1112 # Process generic CTAB property line... 1113 sub _ParseCmpdGenericPropertyLine { 1114 my($PropertyName, $Line) = @_; 1115 1116 my($Label, $PropertyLabel, $ValuesCount, $ValuePairsCount, @ValuePairs); 1117 1118 @ValuePairs = (); 1119 ($Label, $PropertyLabel, $ValuesCount, @ValuePairs) = split(' ', $Line); 1120 $ValuePairsCount = (scalar @ValuePairs)/2; 1121 if ($ValuesCount != $ValuePairsCount) { 1122 carp "Warning: _ParseCmpdGenericPropertyLine: Number of atom number and $PropertyName value paris specified on $Label $PropertyLabel property line, $ValuePairsCount, does not match expected value of $ValuesCount..."; 1123 } 1124 1125 return (@ValuePairs); 1126 } 1127 1128 # Generic CTAB property lines for charge, istope and radical properties... 1129 # 1130 sub _GenerateCmpdGenericPropertyLines { 1131 my($PropertyName, $PropertyValuePairsRef) = @_; 1132 my($Index, $PropertyLabel, $Line, $PropertyCount, $AtomNum, $PropertyValue, @PropertyLines); 1133 1134 @PropertyLines = (); 1135 NAME: { 1136 if ($PropertyName =~ /^Charge$/i) { $PropertyLabel = "M CHG"; last NAME; } 1137 if ($PropertyName =~ /^Isotope$/i) { $PropertyLabel = "M ISO"; last NAME; } 1138 if ($PropertyName =~ /^Radical$/i) { $PropertyLabel = "M RAD"; last NAME; } 1139 carp "Warning: _GenerateCmpdGenericPropertyLines: Unknown property name, $PropertyName, specified..."; 1140 return @PropertyLines; 1141 } 1142 1143 # A maximum of 8 property pair values allowed per line... 1144 $PropertyCount = 0; 1145 $Line = ''; 1146 for ($Index = 0; $Index < $#{$PropertyValuePairsRef}; $Index += 2) { 1147 if ($PropertyCount > 8) { 1148 # Setup property line... 1149 $Line = "${PropertyLabel} 8${Line}"; 1150 push @PropertyLines, $Line; 1151 1152 $PropertyCount = 0; 1153 $Line = ''; 1154 } 1155 $PropertyCount++; 1156 $AtomNum = $PropertyValuePairsRef->[$Index]; 1157 $PropertyValue = $PropertyValuePairsRef->[$Index + 1]; 1158 $Line .= sprintf " %3i %3i", $AtomNum, $PropertyValue; 1159 } 1160 if ($Line) { 1161 $Line = "${PropertyLabel} ${PropertyCount}${Line}"; 1162 push @PropertyLines, $Line; 1163 } 1164 return @PropertyLines; 1165 } 1166 1167 # 1168 # Read compound data into a string and return its value 1169 sub ReadCmpdString { 1170 my($SDFileRef) = @_; 1171 my($CmpdString); 1172 1173 $CmpdString = ""; 1174 LINE: while (defined($_ = <$SDFileRef>)) { 1175 # Change Windows and Mac new line char to UNIX... 1176 s/(\r\n)|(\r)/\n/g; 1177 1178 if (/^\$\$\$\$/) { 1179 # Take out any new line char at the end by explicitly removing it instead of using 1180 # chomp, which might not always work correctly on files generated on a system 1181 # with a value of input line separator different from the current system... 1182 s/\n$//g; 1183 1184 # Doesn't hurt to chomp... 1185 chomp; 1186 1187 $CmpdString .= $_; 1188 last LINE; 1189 } 1190 else { 1191 $CmpdString .= $_; 1192 } 1193 } 1194 return $CmpdString; 1195 } 1196 1197 # Find out the number of fragements in the compounds. And for the compound with 1198 # more than one fragment, remove all the others besides the largest one. 1199 sub WashCmpd { 1200 my($CmpdLines) = @_; 1201 my($WashedCmpdString, $FragmentCount, $Fragments); 1202 1203 $WashedCmpdString = ""; 1204 ($FragmentCount, $Fragments) = GetCmpdFragments($CmpdLines); 1205 if ($FragmentCount > 1) { 1206 # Go over the compound data for the largest fragment including property 1207 # data... 1208 my (@AllFragments, @LargestFragment, %LargestFragmentAtoms, @WashedCmpdLines, $Index, $LineIndex, $AtomCount, $BondCount, $NewAtomCount, $NewBondCount, $FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo, $FirstNewAtomNum, $SecondNewAtomNum, $AtomNum, $ChiralFlag, $BondLine, $MENDLineIndex, $Line, $Value, @ValuePairs, @NewValuePairs, $ValuePairIndex, $NewAtomNum, @NewPropertyLines); 1209 1210 @AllFragments = (); @LargestFragment = (); 1211 %LargestFragmentAtoms = (); 1212 @AllFragments = split "\n", $Fragments; 1213 @LargestFragment = split " ", $AllFragments[0]; 1214 for $Index (0 .. $#LargestFragment) { 1215 # Map old atom numbers to new atom numbers as the fragment atom numbers are sorted 1216 # from lowest to highest old atom numbers... 1217 $LargestFragmentAtoms{$LargestFragment[$Index]} = $Index + 1; 1218 } 1219 @WashedCmpdLines = (); 1220 push @WashedCmpdLines, @$CmpdLines[0], @$CmpdLines[1], @$CmpdLines[2], @$CmpdLines[3]; 1221 ($AtomCount, $BondCount, $ChiralFlag) = ParseCmpdCountsLine(@$CmpdLines[3]); 1222 $NewAtomCount = @LargestFragment; 1223 $NewBondCount = 0; 1224 $AtomNum = 0; 1225 # Retrieve the largest fragment atom lines... 1226 for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) { 1227 $AtomNum++; 1228 if ($LargestFragmentAtoms{$AtomNum}) { 1229 push @WashedCmpdLines, @$CmpdLines[$LineIndex]; 1230 } 1231 } 1232 # Retrieve the largest fragment bond lines... 1233 for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) { 1234 ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]); 1235 if ($LargestFragmentAtoms{$FirstAtomNum} && $LargestFragmentAtoms{$SecondAtomNum}) { 1236 $NewBondCount++; 1237 # Set up bond line with new atom number mapping... 1238 $FirstNewAtomNum = $LargestFragmentAtoms{$FirstAtomNum}; 1239 $SecondNewAtomNum = $LargestFragmentAtoms{$SecondAtomNum}; 1240 $BondLine = GenerateCmpdBondLine($FirstNewAtomNum, $SecondNewAtomNum, $BondType, $BondStereo); 1241 push @WashedCmpdLines, $BondLine; 1242 } 1243 } 1244 # Get property lines for CHG, ISO and RAD label and map the old atom numbers to new 1245 # atom numners; Others, property lines before M END line are skipped as atom numbers for 1246 # other properties might not valid anymore... 1247 # 1248 $MENDLineIndex = $LineIndex; 1249 LINE: for ($LineIndex = (4 + $AtomCount + $BondCount); $LineIndex < @$CmpdLines; $LineIndex++) { 1250 $Line = @$CmpdLines[$LineIndex]; 1251 if ($Line =~ /^M END/i) { 1252 push @WashedCmpdLines, "M END"; 1253 $MENDLineIndex = $LineIndex; 1254 last LINE; 1255 } 1256 1257 @ValuePairs = (); 1258 if ($Line =~ /^M CHG/i) { 1259 @ValuePairs = ParseCmpdChargePropertyLine($Line); 1260 } 1261 elsif ($Line =~ /^M RAD/i) { 1262 @ValuePairs = ParseCmpdRadicalPropertyLine($Line); 1263 } 1264 elsif ($Line =~ /^M ISO/i) { 1265 @ValuePairs = ParseCmpdIsotopePropertyLine($Line); 1266 } 1267 elsif ($Line =~ /^A /i) { 1268 my($NextLine); 1269 $LineIndex++; 1270 $NextLine = @$CmpdLines[$LineIndex]; 1271 @ValuePairs = ParseCmpdAtomAliasPropertyLine($Line, $NextLine); 1272 } 1273 else { 1274 next LINE; 1275 } 1276 1277 if (!@ValuePairs) { 1278 next LINE; 1279 } 1280 1281 # Collect values for valid atom numbers with mapping to new atom numbers... 1282 @NewValuePairs = (); 1283 VALUEINDEX: for ($ValuePairIndex = 0; $ValuePairIndex < $#ValuePairs; $ValuePairIndex += 2) { 1284 $AtomNum = $ValuePairs[$ValuePairIndex]; $Value = $ValuePairs[$ValuePairIndex + 1]; 1285 if (!exists $LargestFragmentAtoms{$AtomNum}) { 1286 next VALUEINDEX; 1287 } 1288 $NewAtomNum = $LargestFragmentAtoms{$AtomNum}; 1289 push @NewValuePairs, ($NewAtomNum, $Value) 1290 } 1291 if (!@NewValuePairs) { 1292 next LINE; 1293 } 1294 @NewPropertyLines = (); 1295 if ($Line =~ /^M CHG/i) { 1296 @NewPropertyLines = GenerateCmpdChargePropertyLines(\@NewValuePairs); 1297 } 1298 elsif ($Line =~ /^M RAD/i) { 1299 @NewPropertyLines = GenerateCmpdRadicalPropertyLines(\@NewValuePairs); 1300 } 1301 elsif ($Line =~ /^M ISO/i) { 1302 @NewPropertyLines = GenerateCmpdIsotopePropertyLines(\@NewValuePairs); 1303 } 1304 elsif ($Line =~ /^A /i) { 1305 @NewPropertyLines = GenerateCmpdAtomAliasPropertyLines(\@NewValuePairs); 1306 } 1307 push @WashedCmpdLines, @NewPropertyLines; 1308 } 1309 1310 # Retrieve rest of the data label and value property data... 1311 for ($LineIndex = (1 + $MENDLineIndex); $LineIndex < @$CmpdLines; $LineIndex++) { 1312 push @WashedCmpdLines, @$CmpdLines[$LineIndex]; 1313 } 1314 # Update atom and bond count line... 1315 $WashedCmpdLines[3] = GenerateCmpdCountsLine($NewAtomCount, $NewBondCount, $ChiralFlag); 1316 1317 $WashedCmpdString = join "\n", @WashedCmpdLines; 1318 } 1319 return ($FragmentCount, $Fragments, $WashedCmpdString); 1320 } 1321