MayaChemTools

   1 package SDFileUtil;
   2 #
   3 # File: SDFileUtil.pm
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
  16 # any warranty; without even the implied warranty of merchantability of fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use Exporter;
  28 use Carp;
  29 use PeriodicTable qw(IsElement);
  30 use TimeUtil ();
  31 
  32 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  33 
  34 @ISA = qw(Exporter);
  35 @EXPORT = qw(GenerateCmpdAtomLine GenerateCmpdBondLine GenerateCmpdChargePropertyLines GenerateCmpdCommentsLine GenerateCmpdCountsLine GenerateCmpdAtomAliasPropertyLines GenerateCmpdIsotopePropertyLines GenerateCmpdDataHeaderLabelsAndValuesLines GenerateCmpdMiscInfoLine GenerateCmpdRadicalPropertyLines GenerateCmpdMolNameLine GenerateEmptyCtabBlockLines GenerateMiscLineDateStamp GetAllAndCommonCmpdDataHeaderLabels GetCmpdDataHeaderLabels GetCmpdDataHeaderLabelsAndValues GetCmpdFragments GetCtabLinesCount GetUnknownAtoms GetInvalidAtomNumbers MDLChargeToInternalCharge InternalChargeToMDLCharge MDLBondTypeToInternalBondOrder InternalBondOrderToMDLBondType MDLBondStereoToInternalBondStereochemistry InternalBondStereochemistryToMDLBondStereo InternalSpinMultiplicityToMDLRadical MDLRadicalToInternalSpinMultiplicity IsCmpd3D IsCmpd2D ParseCmpdAtomLine ParseCmpdBondLine ParseCmpdCommentsLine ParseCmpdCountsLine ParseCmpdMiscInfoLine ParseCmpdMolNameLine ParseCmpdAtomAliasPropertyLine ParseCmpdChargePropertyLine ParseCmpdIsotopePropertyLine ParseCmpdRadicalPropertyLine ReadCmpdString RemoveCmpdDataHeaderLabelAndValue WashCmpd);
  36 @EXPORT_OK = qw();
  37 %EXPORT_TAGS = (all  => [@EXPORT, @EXPORT_OK]);
  38 
  39 # Format data for compounds count line...
  40 sub GenerateCmpdCountsLine {
  41   my($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version, $Line);
  42 
  43   if (@_ == 5) {
  44     ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version) = @_;
  45   }
  46   elsif (@_ == 3) {
  47     ($AtomCount, $BondCount, $ChiralFlag) = @_;
  48     $PropertyCount = 999;
  49     $Version = "V2000";
  50   }
  51   else {
  52     ($AtomCount, $BondCount) = @_;
  53     $ChiralFlag = 0;
  54     $PropertyCount = 999;
  55     $Version = "V2000";
  56   }
  57   if ($AtomCount > 999) {
  58     croak "Error: SDFileUtil::GenerateCmpdCountsLine: The atom count, $AtomCount, exceeds maximum of 999 allowed for CTAB version 2000. The Extended Connection Table (V3000) format in MDL MOL and SD files is not supported by the current release of MayaChemTools...";
  59   }
  60   $Line = sprintf "%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%6s", $AtomCount, $BondCount, 0, 0, $ChiralFlag, 0, 0, 0, 0, 0, $PropertyCount, $Version;
  61 
  62   return ($Line);
  63 }
  64 
  65 # Generate comments line...
  66 sub GenerateCmpdCommentsLine {
  67   my($Comments) = @_;
  68   my($Line);
  69 
  70   $Line = (length($Comments) > 80) ? substr($Comments, 0, 80) : $Comments;
  71 
  72   return $Line;
  73 }
  74 
  75 # Generate molname line...
  76 sub GenerateCmpdMolNameLine {
  77   my($MolName) = @_;
  78   my($Line);
  79 
  80   $Line = (length($MolName) > 80) ? substr($MolName, 0, 80) : $MolName;
  81 
  82   return $Line;
  83 }
  84 
  85 # Generate data for compounds misc info line...
  86 sub GenerateCmpdMiscInfoLine {
  87   my($ProgramName, $UserInitial, $Code) = @_;
  88   my($Date, $Line);
  89 
  90   if (!(defined($ProgramName) && $ProgramName)) {
  91     $ProgramName = "MayaChem";
  92   }
  93   if (!(defined($UserInitial) && $UserInitial)) {
  94     $UserInitial = "  ";
  95   }
  96   if (!(defined($Code) && $Code)) {
  97     $Code = "2D";
  98   }
  99 
 100   if (length($ProgramName) > 8) {
 101     $ProgramName = substr($ProgramName, 0, 8);
 102   }
 103   if (length($UserInitial) > 2) {
 104     $UserInitial = substr($UserInitial, 0, 2);
 105   }
 106   if (length($Code) > 2) {
 107     $Code = substr($Code, 0, 2);
 108   }
 109   $Date = GenerateMiscLineDateStamp();
 110 
 111   $Line = "${UserInitial}${ProgramName}${Date}${Code}";
 112 
 113   return $Line;
 114 }
 115 
 116 # Generate data for compounds misc info line...
 117 sub GenerateEmptyCtabBlockLines {
 118   my($Date, $Lines);
 119 
 120   if (@_ == 1) {
 121     ($Date) = @_;
 122   }
 123   else {
 124     $Date = GenerateMiscLineDateStamp();
 125   }
 126   # First line: Blank molname line...
 127   # Second line: Misc info...
 128   # Third line: Blank comments line...
 129   # Fourth line: Counts line reflecting empty structure data block...
 130   $Lines = "\n";
 131   $Lines .= "  MayaChem${Date}2D\n";
 132   $Lines .= "\n";
 133   $Lines .= GenerateCmpdCountsLine(0, 0, 0) . "\n";
 134   $Lines .= "M  END";
 135 
 136   return $Lines;
 137 }
 138 
 139 # Generate SD file data stamp...
 140 sub GenerateMiscLineDateStamp {
 141   return TimeUtil::SDFileTimeStamp();
 142 }
 143 
 144 # Generate data for compound atom line...
 145 #
 146 sub GenerateCmpdAtomLine {
 147   my($AtomSymbol, $AtomX, $AtomY, $AtomZ, $MassDifference, $Charge, $StereoParity) = @_;
 148   my($Line);
 149 
 150   if (!defined $MassDifference) {
 151     $MassDifference = 0;
 152   }
 153   if (!defined $Charge) {
 154     $Charge = 0;
 155   }
 156   if (!defined $StereoParity) {
 157     $StereoParity = 0;
 158   }
 159   $Line = sprintf "%10.4f%10.4f%10.4f %-3s%2i%3i%3i  0  0  0  0  0  0  0  0  0", $AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity;
 160 
 161   return $Line
 162 }
 163 
 164 # Generate data for compound bond line...
 165 #
 166 sub GenerateCmpdBondLine {
 167   my($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = @_;
 168   my($Line);
 169 
 170   if (!defined $BondStereo) {
 171     $BondStereo = 0;
 172   }
 173   $Line = sprintf "%3i%3i%3i%3i  0  0  0", $FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo;
 174 
 175   return $Line
 176 }
 177 
 178 # Generate charge property lines for CTAB block...
 179 #
 180 sub GenerateCmpdChargePropertyLines {
 181   my($ChargeValuePairsRef) = @_;
 182 
 183   return _GenerateCmpdGenericPropertyLines('Charge', $ChargeValuePairsRef);
 184 }
 185 
 186 # Generate isotope property lines for CTAB block...
 187 #
 188 sub GenerateCmpdIsotopePropertyLines {
 189   my($IsotopeValuePairsRef) = @_;
 190 
 191   return _GenerateCmpdGenericPropertyLines('Isotope', $IsotopeValuePairsRef);
 192 }
 193 
 194 # Generate radical property line property lines for CTAB block...
 195 #
 196 sub GenerateCmpdRadicalPropertyLines {
 197   my($RadicalValuePairsRef) = @_;
 198 
 199   return _GenerateCmpdGenericPropertyLines('Radical', $RadicalValuePairsRef);
 200 }
 201 
 202 # Generate atom alias property line property lines for CTAB block...
 203 #
 204 # Atom alias property line format:
 205 #
 206 # A  aaa
 207 # x...
 208 #
 209 #    aaa: Atom number
 210 #    x: Atom alias in next line
 211 #
 212 sub GenerateCmpdAtomAliasPropertyLines {
 213   my($PropertyValuePairsRef) = @_;
 214   my($Index, $AtomNum, $AtomAlias, $Line, @PropertyLines);
 215 
 216   @PropertyLines = ();
 217 
 218   for ($Index = 0; $Index < $#{$PropertyValuePairsRef}; $Index += 2) {
 219     $AtomNum = $PropertyValuePairsRef->[$Index];
 220     $AtomAlias = $PropertyValuePairsRef->[$Index + 1];
 221 
 222     $Line = "A  " . sprintf "%3i", $AtomNum;
 223 
 224     push @PropertyLines, $Line;
 225     push @PropertyLines, $AtomAlias;
 226   }
 227 
 228   return @PropertyLines;
 229 }
 230 
 231 # Generate data header labels and values lines...
 232 #
 233 sub GenerateCmpdDataHeaderLabelsAndValuesLines {
 234   my($DataHeaderLabelsRef, $DataHeaderLabelsAndValuesRef, $SortDataLabels) = @_;
 235   my($DataLabel, $DataValue, @DataLabels, @DataLines);
 236 
 237   if (!defined $SortDataLabels) {
 238     $SortDataLabels = 0;
 239   }
 240 
 241   @DataLines = ();
 242   @DataLabels = ();
 243   if ($SortDataLabels) {
 244     push @DataLabels, sort @{$DataHeaderLabelsRef};
 245   }
 246   else {
 247     push @DataLabels,  @{$DataHeaderLabelsRef};
 248   }
 249   for $DataLabel (@DataLabels) {
 250     $DataValue = '';
 251     if (exists $DataHeaderLabelsAndValuesRef->{$DataLabel}) {
 252       $DataValue = $DataHeaderLabelsAndValuesRef->{$DataLabel};
 253     }
 254     push @DataLines, (">  <${DataLabel}>", "$DataValue", "");
 255   }
 256   return @DataLines;
 257 }
 258 
 259 # Parse data field header in SD file and return lists of all and common data field
 260 # labels.
 261 sub GetAllAndCommonCmpdDataHeaderLabels {
 262   my($SDFileRef) = @_;
 263   my($CmpdCount, $CmpdString, $Label, @CmpdLines, @DataFieldLabels, @CommonDataFieldLabels, %DataFieldLabelsMap);
 264 
 265   $CmpdCount = 0;
 266   @DataFieldLabels = ();
 267   @CommonDataFieldLabels = ();
 268   %DataFieldLabelsMap = ();
 269 
 270   while ($CmpdString = ReadCmpdString($SDFileRef)) {
 271     $CmpdCount++;
 272     @CmpdLines = split "\n", $CmpdString;
 273     # Process compound data header labels and figure out which ones are present for
 274     # all the compounds...
 275     if (@DataFieldLabels) {
 276       my (@CmpdDataFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines);
 277       my(%CmpdDataFieldLabelsMap) = ();
 278       # Setup a map for the current labels...
 279       for $Label (@CmpdDataFieldLabels) {
 280         $CmpdDataFieldLabelsMap{$Label} = "PresentInSome";
 281       }
 282       # Check the presence old labels for this compound; otherwise, mark 'em new...
 283       for $Label (@DataFieldLabels) {
 284         if (!$CmpdDataFieldLabelsMap{$Label}) {
 285           $DataFieldLabelsMap{$Label} = "PresentInSome";
 286         }
 287       }
 288       # Check the presence this compound in the old labels; otherwise, add 'em...
 289       for $Label (@CmpdDataFieldLabels ) {
 290         if (!$DataFieldLabelsMap{$Label}) {
 291           # It's a new label...
 292           push @DataFieldLabels, $Label;
 293           $DataFieldLabelsMap{$Label} = "PresentInSome";
 294         }
 295       }
 296     }
 297     else {
 298       # Get the initial label set and set up a map...
 299       @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
 300       for $Label (@DataFieldLabels) {
 301         $DataFieldLabelsMap{$Label} = "PresentInAll";
 302       }
 303     }
 304   }
 305   # Identify the common data field labels...
 306   @CommonDataFieldLabels = ();
 307   for $Label (@DataFieldLabels) {
 308     if ($DataFieldLabelsMap{$Label} eq "PresentInAll") {
 309       push @CommonDataFieldLabels, $Label;
 310     }
 311   }
 312   return ($CmpdCount, \@DataFieldLabels, \@CommonDataFieldLabels);
 313 }
 314 
 315 # Parse all the data header labels and return 'em as an list...
 316 #
 317 # Format:
 318 #
 319 #> Data header line
 320 #Data line(s)
 321 #Blank line
 322 #
 323 # [Data Header] (one line) precedes each item of data, starts with a greater than (>) sign, and
 324 # contains at least one of the following:
 325 #  The field name enclosed in angle brackets. For example: <melting.point>
 326 #  The field number, DTn , where n represents the number assigned to the field in a MACCS-II database
 327 #
 328 #Optional information for the data header includes:
 329 #  The compound’s external and internal registry numbers. External registry numbers must be enclosed in parentheses.
 330 #  Any combination of information
 331 #
 332 #The following are examples of valid data headers:
 333 #> <MELTING.POINT>
 334 #> 55 (MD-08974) <BOILING.POINT> DT12
 335 #> DT12 55
 336 #> (MD-0894) <BOILING.POINT> FROM ARCHIVES
 337 #
 338 #Notes: Sometimes last blank line is missing and can be just followed by $$$$
 339 #
 340 sub GetCmpdDataHeaderLabels {
 341   my($CmpdLines) = @_;
 342   my($CmpdLine, $Label, @Labels);
 343 
 344   @Labels = ();
 345   CMPDLINE: for $CmpdLine (@$CmpdLines) {
 346     if ($CmpdLine !~ /^>/) {
 347       next CMPDLINE;
 348     }
 349     # Does the line contains field name enclosed in angular brackets?
 350     ($Label) = $CmpdLine =~ /<.*?>/g;
 351     if (!defined($Label)) {
 352       next CMPDLINE;
 353     }
 354     $Label =~ s/(<|>)//g;
 355     push @Labels, $Label;
 356   }
 357   return (@Labels);
 358 }
 359 
 360 # Parse all the data header labels and values
 361 sub GetCmpdDataHeaderLabelsAndValues {
 362   my($CmpdLines) = @_;
 363   my($CmpdLine, $CurrentLabel, $Label, $Value, $ValueCount, $ProcessingLabelData, @Values, %DataFields);
 364 
 365   %DataFields = ();
 366   $ProcessingLabelData = 0;
 367   $ValueCount = 0;
 368   CMPDLINE: for $CmpdLine (@$CmpdLines) {
 369     if ($CmpdLine =~ /^\$\$\$\$/) {
 370       last CMPDLINE;
 371     }
 372     if ($CmpdLine =~ /^>/) {
 373       # Does the line contains field name enclosed in angular brackets?
 374       ($Label) = $CmpdLine =~ /<.*?>/g;
 375       if (defined $Label) {
 376         $CurrentLabel = $Label;
 377         $CurrentLabel =~ s/(<|>)//g;
 378         $ProcessingLabelData = 0;
 379         $ValueCount = 0;
 380 
 381         if ($CurrentLabel) {
 382           $ProcessingLabelData = 1;
 383           $DataFields{$CurrentLabel} = '';
 384           next CMPDLINE;
 385         }
 386       }
 387       else {
 388         if (!$ProcessingLabelData) {
 389           # Data line containing no <label> as allowed by SDF format. Just ignore it...
 390           next CMPDLINE;
 391         }
 392       }
 393     }
 394     if (!$ProcessingLabelData) {
 395       next CMPDLINE;
 396     }
 397     if (!(defined($CmpdLine) && length($CmpdLine))) {
 398       # Blank line terminates value for a label...
 399       $CurrentLabel = '';
 400       $ValueCount = 0;
 401       $ProcessingLabelData = 0;
 402       next CMPDLINE;
 403     }
 404     $ValueCount++;
 405     $Value = $CmpdLine;
 406 
 407     if ($ValueCount > 1) {
 408       $DataFields{$CurrentLabel} .= "\n" . $Value;
 409     }
 410     else {
 411       $DataFields{$CurrentLabel} = $Value;
 412     }
 413   }
 414   return (%DataFields);
 415 }
 416 
 417 # Return an updated compoud string after removing  data header label along with its
 418 # value from the specified compound string...
 419 #
 420 sub RemoveCmpdDataHeaderLabelAndValue {
 421   my($CmpdString, $DataHeaderLabel) = @_;
 422   my($Line, $PorcessingDataHeaderLabel, @CmpdLines);
 423 
 424   @CmpdLines = ();
 425   $PorcessingDataHeaderLabel = 0;
 426 
 427   CMPDLINE: for $Line (split "\n", $CmpdString) {
 428     if ($Line =~ /^>/ && $Line =~ /<$DataHeaderLabel>/i) {
 429       $PorcessingDataHeaderLabel = 1;
 430       next CMPDLINE;
 431     }
 432 
 433     if ($PorcessingDataHeaderLabel) {
 434       # Blank line indicates end of fingerprints data value...
 435       if ($Line =~ /^\$\$\$\$/) {
 436         push @CmpdLines, $Line;
 437         $PorcessingDataHeaderLabel = 0;
 438       }
 439       elsif (!length($Line)) {
 440         $PorcessingDataHeaderLabel = 0;
 441       }
 442       next CMPDLINE;
 443     }
 444 
 445     # Track compound lines without fingerprints data...
 446     push @CmpdLines, $Line;
 447   }
 448 
 449   return join "\n", @CmpdLines;
 450 }
 451 
 452 #
 453 # Using bond blocks, figure out the number of disconnected fragments  and
 454 # return their values along with the atom numbers in a string delimited by new
 455 # line character.
 456 #
 457 sub GetCmpdFragments {
 458   my($CmpdLines) = @_;
 459   my($AtomCount, $BondCount, $FirstAtomNum, $SecondAtomNum, @AtomConnections, $BondType, $FragmentString, $FragmentCount, $LineIndex, $Index, $AtomNum, $NbrAtomNum, @ProcessedAtoms, $ProcessedAtomCount, $ProcessAtomNum, @ProcessingAtoms, @ConnectedAtoms, %Fragments, $FragmentNum, $AFragmentString);
 460 
 461   # Setup the connection table for each atom...
 462   @AtomConnections = ();
 463   ($AtomCount, $BondCount) = ParseCmpdCountsLine(@$CmpdLines[3]);
 464   for $AtomNum (1 .. $AtomCount) {
 465     %{$AtomConnections[$AtomNum]} = ();
 466   }
 467   for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) {
 468     ($FirstAtomNum, $SecondAtomNum, $BondType) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]);
 469     if (!$AtomConnections[$FirstAtomNum]{$SecondAtomNum}) {
 470       $AtomConnections[$FirstAtomNum]{$SecondAtomNum} = $BondType;
 471     }
 472     if (!$AtomConnections[$SecondAtomNum]{$FirstAtomNum}) {
 473       $AtomConnections[$SecondAtomNum]{$FirstAtomNum} = $BondType;
 474     }
 475   }
 476 
 477   #Get set to count fragments...
 478   $ProcessedAtomCount = 0;
 479   $FragmentNum = 0;
 480   %Fragments = ();
 481   @ProcessedAtoms = ();
 482   for $AtomNum (1 .. $AtomCount) {
 483     $ProcessedAtoms[$AtomNum] = 0;
 484   }
 485   while ($ProcessedAtomCount < $AtomCount) {
 486     @ProcessingAtoms = ();
 487     @ConnectedAtoms = ();
 488     ATOMNUM: for $AtomNum (1 .. $AtomCount) {
 489       if (!$ProcessedAtoms[$AtomNum]) {
 490         $ProcessedAtomCount++;
 491         $ProcessedAtoms[$AtomNum] = 1;
 492         push @ProcessingAtoms, $AtomNum;
 493         $FragmentNum++;
 494         @{$Fragments{$FragmentNum} } = ();
 495         push @{$Fragments{$FragmentNum} }, $AtomNum;
 496         last ATOMNUM;
 497       }
 498     }
 499 
 500     # Go over the neighbors and follow the connection trail while collecting the
 501     # atoms numbers present in the connected fragment...
 502     while (@ProcessingAtoms) {
 503       for ($Index = 0; $Index < @ProcessingAtoms; $Index++) {
 504         $ProcessAtomNum = $ProcessingAtoms[$Index];
 505         for $NbrAtomNum (keys %{$AtomConnections[$ProcessAtomNum]})  {
 506           if (!$ProcessedAtoms[$NbrAtomNum]) {
 507             $ProcessedAtomCount++;
 508             $ProcessedAtoms[$NbrAtomNum] = 1;
 509             push @ConnectedAtoms, $NbrAtomNum;
 510             push @{ $Fragments{$FragmentNum} }, $NbrAtomNum;
 511           }
 512         }
 513       }
 514       @ProcessingAtoms = ();
 515       @ProcessingAtoms = @ConnectedAtoms;
 516       @ConnectedAtoms = ();
 517     }
 518   }
 519   $FragmentCount = $FragmentNum;
 520   $FragmentString = "";
 521 
 522   # Sort out the fragments by size...
 523   for $FragmentNum (sort { @{$Fragments{$b}} <=> @{$Fragments{$a}}  } keys %Fragments ) {
 524     # Sort the atoms in a fragment by their numbers...
 525     $AFragmentString = join " ", sort { $a <=> $b } @{ $Fragments{$FragmentNum} };
 526     if ($FragmentString) {
 527       $FragmentString .=  "\n" . $AFragmentString;
 528     }
 529     else {
 530       $FragmentString = $AFragmentString;
 531     }
 532   }
 533   return ($FragmentCount, $FragmentString);
 534 }
 535 
 536 # Count number of lines present in between 4th and line containg "M END"
 537 sub GetCtabLinesCount {
 538   my($CmpdLines) = @_;
 539   my($LineIndex, $CtabLinesCount);
 540 
 541   $CtabLinesCount = 0;
 542  LINE: for ($LineIndex = 4; $LineIndex < @$CmpdLines; $LineIndex++) {
 543     #
 544     # Any line after atom and bond data starting with anything other than space or
 545     # a digit indicates end of Ctab atom/bond data block...
 546     #
 547     if (@$CmpdLines[$LineIndex] !~ /^[0-9 ]/) {
 548       $CtabLinesCount = $LineIndex - 4;
 549       last LINE;
 550     }
 551   }
 552   return $CtabLinesCount;
 553 }
 554 
 555 # Using atom blocks, count the number of atoms which contain special element
 556 # symbols not present in the periodic table.
 557 sub GetUnknownAtoms {
 558   my($CmpdLines) = @_;
 559   my($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines, $LineIndex, $AtomCount, $AtomSymbol);
 560 
 561   $UnknownAtomCount = 0;
 562   $UnknownAtoms = "";
 563   $UnknownAtomLines = "";
 564   ($AtomCount) = ParseCmpdCountsLine(@$CmpdLines[3]);
 565   for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) {
 566     ($AtomSymbol) = ParseCmpdAtomLine(@$CmpdLines[$LineIndex]);
 567     if (!IsElement($AtomSymbol)) {
 568       $UnknownAtomCount++;
 569       $UnknownAtoms .= " $AtomSymbol";
 570       if ($UnknownAtomLines) {
 571         $UnknownAtomLines .= "\n" . @$CmpdLines[$LineIndex];
 572       }
 573       else {
 574         $UnknownAtomLines = @$CmpdLines[$LineIndex];
 575       }
 576     }
 577   }
 578   return ($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines);
 579 }
 580 
 581 # Check z coordinates of all atoms to see whether any of them is non-zero
 582 # which makes the compound geometry three dimensional...
 583 #
 584 sub IsCmpd3D {
 585   my($CmpdLines) = @_;
 586   my($LineIndex, $AtomCount, $AtomSymbol, $AtomX, $AtomY, $AtomZ);
 587 
 588   ($AtomCount) = ParseCmpdCountsLine(@$CmpdLines[3]);
 589   for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) {
 590     ($AtomSymbol, $AtomX, $AtomY, $AtomZ) = ParseCmpdAtomLine(@$CmpdLines[$LineIndex]);
 591     if ($AtomZ != 0) {
 592       return 1;
 593     }
 594   }
 595   return 0;
 596 }
 597 
 598 # Check whether it's a 2D compound...
 599 #
 600 sub IsCmpd2D {
 601   my($CmpdLines) = @_;
 602 
 603   return IsCmpd3D($CmpdLines) ? 0 : 1;
 604 }
 605 
 606 # Using bond blocks, count the number of bond lines which contain atom numbers
 607 # greater than atom count specified in compound count line...
 608 #
 609 sub GetInvalidAtomNumbers {
 610   my($CmpdLines) = @_;
 611   my($LineIndex, $AtomCount, $BondCount, $FirstAtomNum, $SecondAtomNum, $InvalidAtomNumbersCount, $InvalidAtomNumbers, $InvalidAtomNumberLines, $Line, $InvalidAtomPropertyLine, $ValuePairIndex, $AtomNum, $Value, @ValuePairs);
 612 
 613   ($AtomCount, $BondCount) = ParseCmpdCountsLine(@$CmpdLines[3]);
 614 
 615   $InvalidAtomNumbersCount = 0;
 616   $InvalidAtomNumbers = "";
 617   $InvalidAtomNumberLines = "";
 618 
 619   # Go over bond block lines...
 620   LINE: for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) {
 621     ($FirstAtomNum, $SecondAtomNum) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]);
 622     if ($FirstAtomNum <= $AtomCount && $SecondAtomNum <= $AtomCount) {
 623       next LINE;
 624     }
 625     if ($FirstAtomNum > $AtomCount) {
 626       $InvalidAtomNumbersCount++;
 627       $InvalidAtomNumbers .= " $FirstAtomNum";
 628     }
 629     if ($SecondAtomNum > $AtomCount) {
 630       $InvalidAtomNumbersCount++;
 631       $InvalidAtomNumbers .= " $SecondAtomNum";
 632     }
 633     if ($InvalidAtomNumberLines) {
 634       $InvalidAtomNumberLines .= "\n" . @$CmpdLines[$LineIndex];
 635     }
 636     else {
 637       $InvalidAtomNumberLines = @$CmpdLines[$LineIndex];
 638     }
 639   }
 640   # Go over property lines before M  END...
 641   #
 642   LINE: for ($LineIndex = (4 + $AtomCount + $BondCount); $LineIndex < @$CmpdLines; $LineIndex++) {
 643     $Line = @$CmpdLines[$LineIndex];
 644     @ValuePairs = ();
 645     if ($Line =~ /^M  END/i) {
 646       last LINE;
 647     }
 648     @ValuePairs = ();
 649     if ($Line =~ /^M  CHG/i) {
 650       @ValuePairs = ParseCmpdChargePropertyLine($Line);
 651     }
 652     elsif ($Line =~ /^M  RAD/i) {
 653       @ValuePairs = ParseCmpdRadicalPropertyLine($Line);
 654     }
 655     elsif ($Line =~ /^M  ISO/i) {
 656       @ValuePairs = ParseCmpdIsotopePropertyLine($Line);
 657     }
 658     elsif ($Line =~ /^A  /i) {
 659       my($NextLine);
 660       $LineIndex++;
 661       $NextLine = @$CmpdLines[$LineIndex];
 662       @ValuePairs = ParseCmpdAtomAliasPropertyLine($Line, $NextLine);
 663     }
 664     else {
 665       next LINE;
 666     }
 667 
 668     $InvalidAtomPropertyLine = 0;
 669     for ($ValuePairIndex = 0; $ValuePairIndex < $#ValuePairs; $ValuePairIndex += 2) {
 670       $AtomNum = $ValuePairs[$ValuePairIndex]; $Value = $ValuePairs[$ValuePairIndex + 1];
 671       if ($AtomNum > $AtomCount) {
 672         $InvalidAtomPropertyLine = 1;
 673         $InvalidAtomNumbersCount++;
 674         $InvalidAtomNumbers .= " $AtomNum";
 675       }
 676     }
 677     if ($InvalidAtomPropertyLine) {
 678       if ($InvalidAtomNumberLines) {
 679         $InvalidAtomNumberLines .= "\n" . $Line;
 680       }
 681       else {
 682         $InvalidAtomNumberLines = $Line;
 683       }
 684     }
 685   }
 686 
 687   return ($InvalidAtomNumbersCount, $InvalidAtomNumbers, $InvalidAtomNumberLines);
 688 }
 689 
 690 # Ctab lines: Atom block
 691 #
 692 # Format: xxxxx.xxxxyyyyy.yyyyzzzzz.zzzz aaaddcccssshhhbbbvvvHHHrrriiimmmnnneee
 693 #         A10       A10       A10       xA3 A2A3 A3 A3 A3 A3 A3 A3 A3 A3 A3 A3
 694 # x,y,z: Atom coordinates
 695 # aaa: Atom symbol. Entry in periodic table or L for atom list, A, Q, * for unspecified
 696 #      atom, and LP for lone pair, or R# for Rgroup label
 697 # dd: Mass difference. -3, -2, -1, 0, 1, 2, 3, 4 (0 for value beyond these limits)
 698 # ccc: Charge. 0 = uncharged or value other than these, 1 = +3, 2 = +2, 3 = +1,
 699 #      4 = doublet radical, 5 = -1, 6 = -2, 7 = -3
 700 # sss: Atom stereo parity. 0 = not stereo, 1 = odd, 2 = even, 3 = either or unmarked stereo center
 701 # hhh: Hydrogen count + 1. 1 = H0, 2 = H1, 3 = H2, 4 = H3, 5 = H4
 702 # bbb: Stereo care box. 0 = ignore stereo configuration of this double bond atom, 1 = stereo
 703 #      configuration of double bond atom must match
 704 # vvv: Valence. 0 = no marking (default)(1 to 14) = (1 to 14) 15 = zero valence
 705 # HHH: H0 designator. 0 = not specified, 1 = no H atoms allowed (redundant due to hhh)
 706 # rrr: Not used
 707 # iii: Not used
 708 # mmm: Atom-atom mapping number. 1 - number of atoms
 709 # nnn: Inversion/retention flag. 0 = property not applied, 1 = configuration is inverted,
 710 #      2 = configuration is retained.
 711 # eee: Exact change flag. 0 = property not applied, 1 = change on atom must be
 712 #      exactly as shown
 713 #
 714 # Notes:
 715 #  . StereoParity: 1 - ClockwiseStereo, 2 - AntiClockwiseStereo; 3 - Either; 0 - none. These
 716 #    values determine chirailty around the chiral center; a non zero value indicates atom
 717 #    has been marked as chiral center.
 718 #
 719 sub ParseCmpdAtomLine {
 720   my($Line) = @_;
 721   my ($LineIndex, $AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity);
 722 
 723   ($AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity) = ('') x 7;
 724   if (length($Line) > 31) {
 725     ($AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity) = unpack("A10A10A10xA3A2A3A3", $Line);
 726   }
 727   else {
 728     ($AtomX, $AtomY, $AtomZ, $AtomSymbol) = unpack("A10A10A10", $Line);
 729   }
 730   return ($AtomSymbol, $AtomX, $AtomY, $AtomZ, $MassDifference, $Charge, $StereoParity);
 731 }
 732 
 733 # Map MDL charge value used in SD and MOL files to internal charge used by MayaChemTools.
 734 #
 735 sub MDLChargeToInternalCharge {
 736   my($MDLCharge) = @_;
 737   my($InternalCharge);
 738 
 739   CHARGE: {
 740     if ($MDLCharge == 0) { $InternalCharge = 0; last CHARGE;}
 741     if ($MDLCharge == 1) { $InternalCharge = 3; last CHARGE;}
 742     if ($MDLCharge == 2) { $InternalCharge = 2; last CHARGE;}
 743     if ($MDLCharge == 3) { $InternalCharge = 1; last CHARGE;}
 744     if ($MDLCharge == 5) { $InternalCharge = -1; last CHARGE;}
 745     if ($MDLCharge == 6) { $InternalCharge = -2; last CHARGE;}
 746     if ($MDLCharge == 7) { $InternalCharge = -3; last CHARGE;}
 747     # All other MDL charge values, including 4 corresponding to "doublet radical",
 748     # are assigned internal value of 0.
 749     $InternalCharge = 0;
 750     if ($MDLCharge != 4) {
 751       carp "Warning: MDLChargeToInternalCharge: MDL charge value, $MDLCharge, is not supported: An internal charge value, 0, has been assigned...";
 752     }
 753   }
 754   return $InternalCharge;
 755 }
 756 
 757 # Map internal charge used by MayaChemTools to MDL charge value used in SD and MOL files.
 758 #
 759 sub InternalChargeToMDLCharge {
 760   my($InternalCharge) = @_;
 761   my($MDLCharge);
 762 
 763   CHARGE: {
 764     if ($InternalCharge == 3) { $MDLCharge = 1; last CHARGE;}
 765     if ($InternalCharge == 2) { $MDLCharge = 2; last CHARGE;}
 766     if ($InternalCharge == 1) { $MDLCharge = 3; last CHARGE;}
 767     if ($InternalCharge == -1) { $MDLCharge = 5; last CHARGE;}
 768     if ($InternalCharge == -2) { $MDLCharge = 6; last CHARGE;}
 769     if ($InternalCharge == -3) { $MDLCharge = 7; last CHARGE;}
 770     # All other MDL charge values, including 4 corresponding to "doublet radical",
 771     # are assigned internal value of 0.
 772     $MDLCharge = 0;
 773   }
 774   return $MDLCharge;
 775 }
 776 
 777 # Ctab lines: Bond block
 778 #
 779 # Format: 111222tttsssxxxrrrccc
 780 #
 781 # 111: First atom number.
 782 # 222: Second atom number.
 783 # ttt: Bond type. 1 = Single, 2 = Double, 3 = Triple, 4 = Aromatic, 5 = Single or Double,
 784 #      6 = Single or Aromatic, 7 = Double or Aromatic, 8 = Any
 785 # sss: Bond stereo. Single bonds: 0 = not stereo, 1 = Up, 4 = Either, 6 = Down,
 786 #      Double bonds: 0 = Use x-, y-, z-coords from atom block to determine cis or trans,
 787 #      3 = Cis or trans (either) double bond
 788 # xxx: Not used
 789 # rrr: Bond topology. 0 = Either, 1 = Ring, 2 = Chain
 790 # ccc: Reacting center status. 0 = unmarked, 1 = a center, -1 = not a center,
 791 #      Additional: 2 = no change,4 = bond made/broken, 8 = bond order changes 12 = 4+8
 792 #      (both made/broken and changes); 5 = (4 + 1), 9 = (8 + 1), and 13 = (12 + 1) are also possible
 793 #
 794 sub ParseCmpdBondLine {
 795   my($Line) = @_;
 796   my($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo);
 797 
 798   ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = map {s/ //g; $_} unpack("A3A3A3A3", $Line);
 799   return ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo);
 800 }
 801 
 802 # Map MDL bond type value used in SD and MOL files to internal bond order  and bond types
 803 # values used by MayaChemTools...
 804 #
 805 sub MDLBondTypeToInternalBondOrder {
 806   my($MDLBondType) = @_;
 807   my($InternalBondOrder, $InternalBondType);
 808 
 809   $InternalBondType = '';
 810 
 811   BONDTYPE: {
 812     if ($MDLBondType == 1) { $InternalBondOrder = 1; $InternalBondType = 'Single'; last BONDTYPE;}
 813     if ($MDLBondType == 2) { $InternalBondOrder = 2; $InternalBondType = 'Double'; last BONDTYPE;}
 814     if ($MDLBondType == 3) { $InternalBondOrder = 3; $InternalBondType = 'Triple'; last BONDTYPE;}
 815     if ($MDLBondType == 4) { $InternalBondOrder = 1.5; $InternalBondType = 'Aromatic'; last BONDTYPE;} # Aromatic
 816     if ($MDLBondType == 5) { $InternalBondOrder = 1; $InternalBondType = 'SingleOrDouble'; last BONDTYPE;} # Aromatic
 817     if ($MDLBondType == 6) { $InternalBondOrder = 1; $InternalBondType = 'SingleOrAromatic'; last BONDTYPE;} # Aromatic
 818     if ($MDLBondType == 7) { $InternalBondOrder = 2; $InternalBondType = 'DoubleOrAromatic'; last BONDTYPE;} # Aromatic
 819     if ($MDLBondType == 8) { $InternalBondOrder = 1; $InternalBondType = 'Any'; last BONDTYPE;} # Aromatic
 820     #
 821     # Although MDL aromatic bond values are used for query only and explicit Kekule bond order
 822     # values must be assigned, internal value of 1.5 is allowed to indicate aromatic bond orders.
 823     #
 824     # All other MDL bond type values -  5 = Single or Double, 6 = Single or Aromatic, 7 = Double or Aromatic,
 825     # 8 = Any - are also assigned appropriate internal value of 1: These are meant to be used for
 826     # structure queries by MDL products.
 827     #
 828     $InternalBondOrder = 1;
 829     $InternalBondType = 'Single';
 830 
 831     carp "Warning: MDLBondTypeToInternalBondOrder: MDL bond type value, $MDLBondType, is not supported: An internal bond order value, 0, has been assigned...";
 832   }
 833   return ($InternalBondOrder, $InternalBondType);
 834 }
 835 
 836 # Map internal bond order  and bond type values used by MayaChemTools to MDL bond type value used
 837 # in SD and MOL files...
 838 #
 839 sub InternalBondOrderToMDLBondType {
 840   my($InternalBondOrder, $InternalBondType) = @_;
 841   my($MDLBondType);
 842 
 843   BONDTYPE: {
 844     if ($InternalBondOrder == 1) {
 845       if ($InternalBondType =~ /^SingleOrDouble$/i) {
 846         $MDLBondType = 5;
 847       }
 848       elsif ($InternalBondType =~ /^SingleOrAromatic$/i) {
 849         $MDLBondType = 6;
 850       }
 851       elsif ($InternalBondType =~ /^Any$/i) {
 852         $MDLBondType = 8;
 853       }
 854       else {
 855         $MDLBondType = 1;
 856       }
 857       $MDLBondType = 1;
 858       last BONDTYPE;
 859     }
 860     if ($InternalBondOrder == 2) {
 861       if ($InternalBondType =~ /^DoubleOrAromatic$/i) {
 862         $MDLBondType = 7;
 863       }
 864       else {
 865         $MDLBondType = 2;
 866       }
 867       last BONDTYPE;
 868     }
 869     if ($InternalBondOrder == 3) { $MDLBondType = 3; last BONDTYPE;}
 870     if ($InternalBondOrder == 1.5) { $MDLBondType = 4; last BONDTYPE;}
 871     if ($InternalBondType =~ /^Any$/i) { $MDLBondType = 8; last BONDTYPE;}
 872 
 873     $MDLBondType = 1;
 874 
 875     carp "Warning: InternalBondOrderToMDLBondType: Internal bond order and type values, $InternalBondOrder and $InternalBondType, don't match any valid MDL bond type: MDL bond type value, 1, has been assigned...";
 876   }
 877   return $MDLBondType;
 878 }
 879 
 880 # Third line: Comments - A blank line is also allowed.
 881 sub ParseCmpdCommentsLine {
 882   my($Line) = @_;
 883   my($Comments);
 884 
 885   $Comments = unpack("A80", $Line);
 886 
 887   return ($Comments);
 888 }
 889 
 890 # Map MDL bond stereo value used in SD and MOL files to internal bond stereochemistry values used by MayaChemTools...
 891 #
 892 sub MDLBondStereoToInternalBondStereochemistry {
 893   my($MDLBondStereo) = @_;
 894   my($InternalBondStereo);
 895 
 896   $InternalBondStereo = '';
 897 
 898   BONDSTEREO: {
 899     if ($MDLBondStereo == 1) { $InternalBondStereo = 'Up'; last BONDSTEREO;}
 900     if ($MDLBondStereo == 4) { $InternalBondStereo = 'UpOrDown'; last BONDSTEREO;}
 901     if ($MDLBondStereo == 6) { $InternalBondStereo = 'Down'; last BONDSTEREO;}
 902     if ($MDLBondStereo == 3) { $InternalBondStereo = 'CisOrTrans'; last BONDSTEREO;}
 903     if ($MDLBondStereo == 0) { $InternalBondStereo = 'None'; last BONDSTEREO;}
 904 
 905     $InternalBondStereo = '';
 906     carp "Warning: MDLBondStereoToInternalBondType: MDL bond stereo value, $MDLBondStereo, is not supported: It has been ignored and bond order would be used to determine bond type...";
 907   }
 908   return $InternalBondStereo;
 909 }
 910 
 911 # Map internal bond stereochemistry values used by MayaChemTools to MDL bond stereo value used in SD and MOL files...
 912 #
 913 sub InternalBondStereochemistryToMDLBondStereo {
 914   my($InternalBondStereo) = @_;
 915   my($MDLBondStereo);
 916 
 917   $MDLBondStereo = 0;
 918 
 919   BONDSTEREO: {
 920     if ($InternalBondStereo =~ /^Up$/i) { $MDLBondStereo = 1; last BONDSTEREO;}
 921     if ($InternalBondStereo =~ /^UpOrDown$/i) { $MDLBondStereo = 4; last BONDSTEREO;}
 922     if ($InternalBondStereo =~ /^Down$/) { $MDLBondStereo = 6; last BONDSTEREO;}
 923     if ($InternalBondStereo =~ /^CisOrTrans$/) { $MDLBondStereo = 3; last BONDSTEREO;}
 924 
 925     $MDLBondStereo = 0;
 926   }
 927   return $MDLBondStereo;
 928 }
 929 
 930 # Fourth line: Counts
 931 #
 932 # Format: aaabbblllfffcccsssxxxrrrpppiiimmmvvvvvv
 933 #
 934 # aaa: number of atoms; bbb: number of bonds; lll: number of atom lists; fff: (obsolete)
 935 # ccc: chiral flag: 0=not chiral, 1=chiral; sss: number of stext entries; xxx,rrr,ppp,iii:
 936 # (obsolete); mmm: number of lines of additional properties, including the M END line, No
 937 # longer supported, default is set to 999; vvvvvv: version
 938 
 939 sub ParseCmpdCountsLine {
 940   my($Line) = @_;
 941   my($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version);
 942 
 943   if (length($Line) >= 39) {
 944     ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version) = unpack("A3A3x3x3A3x3x3x3x3x3A3A6", $Line);
 945   }
 946   elsif (length($Line) >= 15) {
 947     ($PropertyCount, $Version) = ("999", "v2000");
 948     ($AtomCount, $BondCount, $ChiralFlag) = unpack("A3A3x3x3A3", $Line);
 949   }
 950   else {
 951     ($ChiralFlag, $PropertyCount, $Version) = ("0", "999", "v2000");
 952     ($AtomCount, $BondCount) = unpack("A3A3", $Line);
 953   }
 954 
 955   if ($Version =~ /V3000/i) {
 956     # Current version of MayaChemTools modules and classes for processing MDL MOL and SD don't support
 957     # V3000. So instead of relying on callers, just exit with an error to disable any processing of V3000
 958     # format.
 959     croak "Error: SDFileUtil::ParseCmpdCountsLine: The Extended Connection Table (V3000) format in MDL MOL and SD files is not supported by the current release of MayaChemTools...";
 960   }
 961 
 962   return ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version);
 963 }
 964 
 965 # Second line: Misc info
 966 #
 967 # Format: IIPPPPPPPPMMDDYYHHmmddSSssssssssssEEEEEEEEEEEERRRRRR
 968 #         A2A8      A10       A2I2A10       A12         A6
 969 # User's first and last initials (I), program name (P), date/time (M/D/Y,H:m),
 970 # dimensional codes - 2D or 3D (d),scaling factors (S, s), energy (E) if modeling program input,
 971 # internal registry number (R) if input through MDL form. A blank line is also allowed.
 972 sub ParseCmpdMiscInfoLine {
 973   my($Line) = @_;
 974   my($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum);
 975 
 976   ($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum) = unpack("A2A8A10A2A2A10A12A6", $Line);
 977   return ($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum);
 978 }
 979 
 980 # First line: Molecule name. This line is unformatted, but like all other lines in a
 981 # molfile may not extend beyond column 80. A blank line is also allowed.
 982 sub ParseCmpdMolNameLine {
 983   my($Line) = @_;
 984   my($MolName);
 985 
 986   $MolName = unpack("A80", $Line);
 987 
 988   return ($MolName);
 989 }
 990 
 991 # Parse atom alias property line in CTAB generic properties block.
 992 #
 993 # Atom alias property line format:
 994 #
 995 # A  aaa
 996 # x...
 997 #
 998 #    aaa: Atom number
 999 #    x: Atom alias in next line
1000 #
1001 sub ParseCmpdAtomAliasPropertyLine {
1002   my($Line, $NextLine) = @_;
1003   my($Label, $AtomNumber, $AtomAlias);
1004 
1005   ($Label, $AtomNumber) = split(' ', $Line);
1006   $AtomAlias = $NextLine;
1007 
1008   if (!$AtomAlias) {
1009     carp "Warning: _ParseCmpdAtomAliasPropertyLine: No atom alias value specified on the line following atom alias property line...";
1010   }
1011 
1012   return ($AtomNumber, $AtomAlias);
1013 }
1014 
1015 # Parse charge property line in CTAB generic properties block.
1016 #
1017 # Charge property line format:
1018 #
1019 # M  CHGnn8 aaa vvv ...
1020 #
1021 #    nn8: Number of value pairs. Maximum of 8 pairs allowed.
1022 #    aaa: Atom number
1023 #    vvv: -15 to +15. Default of 0 = uncharged atom. When present, this property supersedes
1024 #    all charge and radical values in the atom block, forcing a 0 charge on all atoms not
1025 #    listed in an M  CHG or M  RAD line.
1026 #
1027 sub ParseCmpdChargePropertyLine {
1028   my($Line) = @_;
1029 
1030   return _ParseCmpdGenericPropertyLine('Charge', $Line);
1031 }
1032 
1033 
1034 # Parse isotope property line in CTAB generic properties block.
1035 #
1036 # Isoptope property line format:
1037 #
1038 # M  ISOnn8 aaa vvv ...
1039 #
1040 #    nn8: Number of value paris. Maximum of 8 pairs allowed.
1041 #    aaa: Atom number
1042 #    vvv: Absolute mass of the atom isotope as a positive integer. When present, this property
1043 #    supersedes all isotope values in the atom block. Default (no entry) means natural
1044 #    abundance. The difference between this absolute mass value and the natural
1045 #    abundance value specified in the PTABLE.DAT file must be within the range of -18
1046 #    to +12
1047 #
1048 # Notes:
1049 #  . Values correspond to mass numbers...
1050 #
1051 sub ParseCmpdIsotopePropertyLine {
1052   my($Line) = @_;
1053 
1054   return _ParseCmpdGenericPropertyLine('Isotope', $Line);
1055 }
1056 
1057 # Parse radical property line in CTAB generic properties block.
1058 #
1059 # Radical property line format:
1060 #
1061 # M  RADnn8 aaa vvv ...
1062 #
1063 #    nn8: Number of value paris. Maximum of 8 pairs allowed.
1064 #    aaa: Atom number
1065 #    vvv: Default of 0 = no radical, 1 = singlet, 2 = doublet, 3 = triplet . When
1066 #    present, this property supersedes all charge and radical values in the atom block,
1067 #    forcing a 0 (zero) charge and radical on all atoms not listed in an M  CHG or
1068 #    M  RAD line.
1069 #
1070 sub ParseCmpdRadicalPropertyLine {
1071   my($Line) = @_;
1072 
1073   return _ParseCmpdGenericPropertyLine('Radical', $Line);
1074 }
1075 
1076 # Map MDL radical stereo value used in SD and MOL files to internal spin multiplicity values used by MayaChemTools...
1077 #
1078 sub MDLRadicalToInternalSpinMultiplicity {
1079   my($MDLRadical) = @_;
1080   my($InternalSpinMultiplicity);
1081 
1082   $InternalSpinMultiplicity = '';
1083 
1084   SPINMULTIPLICITY: {
1085     if ($MDLRadical == 0) { $InternalSpinMultiplicity = 0; last SPINMULTIPLICITY;}
1086     if ($MDLRadical == 1) { $InternalSpinMultiplicity = 1; last SPINMULTIPLICITY;}
1087     if ($MDLRadical == 2) { $InternalSpinMultiplicity = 2; last SPINMULTIPLICITY;}
1088     if ($MDLRadical == 3) { $InternalSpinMultiplicity = 3; last SPINMULTIPLICITY;}
1089     $InternalSpinMultiplicity = '';
1090     carp "Warning: MDLRadicalToInternalSpinMultiplicity: MDL radical value, $MDLRadical, specifed on line M  RAD is not supported...";
1091   }
1092   return $InternalSpinMultiplicity;
1093 }
1094 
1095 # Map internal spin multiplicity values used by MayaChemTools to MDL radical stereo value used in SD and MOL files...
1096 #
1097 sub InternalSpinMultiplicityToMDLRadical {
1098   my($InternalSpinMultiplicity) = @_;
1099   my($MDLRadical);
1100 
1101   $MDLRadical = 0;
1102 
1103   SPINMULTIPLICITY: {
1104     if ($InternalSpinMultiplicity == 1) { $MDLRadical = 1; last SPINMULTIPLICITY;}
1105     if ($InternalSpinMultiplicity == 2) { $MDLRadical = 2; last SPINMULTIPLICITY;}
1106     if ($InternalSpinMultiplicity == 3) { $MDLRadical = 3; last SPINMULTIPLICITY;}
1107     $MDLRadical = 0;
1108   }
1109   return $MDLRadical;
1110 }
1111 
1112 # Process generic CTAB property line...
1113 sub _ParseCmpdGenericPropertyLine {
1114   my($PropertyName, $Line) = @_;
1115 
1116   my($Label, $PropertyLabel, $ValuesCount, $ValuePairsCount, @ValuePairs);
1117 
1118   @ValuePairs = ();
1119   ($Label, $PropertyLabel, $ValuesCount, @ValuePairs) = split(' ', $Line);
1120   $ValuePairsCount = (scalar @ValuePairs)/2;
1121   if ($ValuesCount != $ValuePairsCount) {
1122     carp "Warning: _ParseCmpdGenericPropertyLine: Number of atom number and $PropertyName value paris specified on $Label $PropertyLabel property line, $ValuePairsCount, does not match expected value of $ValuesCount...";
1123   }
1124 
1125   return (@ValuePairs);
1126 }
1127 
1128 # Generic CTAB property lines for charge, istope and radical properties...
1129 #
1130 sub _GenerateCmpdGenericPropertyLines {
1131   my($PropertyName, $PropertyValuePairsRef) = @_;
1132   my($Index, $PropertyLabel, $Line, $PropertyCount, $AtomNum, $PropertyValue, @PropertyLines);
1133 
1134   @PropertyLines = ();
1135   NAME: {
1136     if ($PropertyName =~ /^Charge$/i) { $PropertyLabel = "M  CHG"; last NAME; }
1137     if ($PropertyName =~ /^Isotope$/i) { $PropertyLabel = "M  ISO"; last NAME; }
1138     if ($PropertyName =~ /^Radical$/i) { $PropertyLabel = "M  RAD"; last NAME; }
1139     carp "Warning: _GenerateCmpdGenericPropertyLines: Unknown property name, $PropertyName, specified...";
1140     return @PropertyLines;
1141   }
1142 
1143   # A maximum of 8 property pair values allowed per line...
1144   $PropertyCount = 0;
1145   $Line = '';
1146   for ($Index = 0; $Index < $#{$PropertyValuePairsRef}; $Index += 2) {
1147     if ($PropertyCount > 8) {
1148       # Setup property line...
1149       $Line = "${PropertyLabel}  8${Line}";
1150       push @PropertyLines, $Line;
1151 
1152       $PropertyCount = 0;
1153       $Line = '';
1154     }
1155     $PropertyCount++;
1156     $AtomNum = $PropertyValuePairsRef->[$Index];
1157     $PropertyValue = $PropertyValuePairsRef->[$Index + 1];
1158     $Line .= sprintf " %3i %3i", $AtomNum, $PropertyValue;
1159   }
1160   if ($Line) {
1161     $Line = "${PropertyLabel}  ${PropertyCount}${Line}";
1162     push @PropertyLines, $Line;
1163   }
1164   return @PropertyLines;
1165 }
1166 
1167 #
1168 # Read compound data into a string and return its value
1169 sub ReadCmpdString {
1170   my($SDFileRef) = @_;
1171   my($CmpdString);
1172 
1173   $CmpdString = "";
1174   LINE: while (defined($_ = <$SDFileRef>)) {
1175     # Change Windows and Mac new line char to UNIX...
1176     s/(\r\n)|(\r)/\n/g;
1177 
1178     if (/^\$\$\$\$/) {
1179       # Take out any new line char at the end by explicitly removing it instead of using
1180       # chomp, which might not always work correctly on files generated on a system
1181       # with a value of input line separator different from the current system...
1182       s/\n$//g;
1183 
1184       # Doesn't hurt to chomp...
1185       chomp;
1186 
1187       $CmpdString .=  $_;
1188       last LINE;
1189     }
1190     else {
1191       $CmpdString .=  $_;
1192     }
1193   }
1194   return $CmpdString;
1195 }
1196 
1197 # Find out the number of fragements in the compounds. And for the compound with
1198 # more than one fragment, remove all the others besides the largest one.
1199 sub WashCmpd {
1200   my($CmpdLines) = @_;
1201   my($WashedCmpdString, $FragmentCount, $Fragments);
1202 
1203   $WashedCmpdString = "";
1204   ($FragmentCount, $Fragments) = GetCmpdFragments($CmpdLines);
1205   if ($FragmentCount > 1) {
1206     # Go over the compound data for the largest fragment including property
1207     # data...
1208     my (@AllFragments, @LargestFragment, %LargestFragmentAtoms, @WashedCmpdLines, $Index, $LineIndex, $AtomCount, $BondCount, $NewAtomCount, $NewBondCount, $FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo, $FirstNewAtomNum, $SecondNewAtomNum, $AtomNum, $ChiralFlag, $BondLine, $MENDLineIndex, $Line, $Value, @ValuePairs, @NewValuePairs, $ValuePairIndex, $NewAtomNum, @NewPropertyLines);
1209 
1210     @AllFragments = (); @LargestFragment = ();
1211     %LargestFragmentAtoms = ();
1212     @AllFragments = split "\n", $Fragments;
1213     @LargestFragment = split " ", $AllFragments[0];
1214     for $Index (0 .. $#LargestFragment) {
1215       # Map old atom numbers to new atom numbers as the fragment atom numbers are sorted
1216       # from lowest to highest old atom numbers...
1217       $LargestFragmentAtoms{$LargestFragment[$Index]} = $Index + 1;
1218     }
1219     @WashedCmpdLines = ();
1220     push @WashedCmpdLines, @$CmpdLines[0], @$CmpdLines[1], @$CmpdLines[2], @$CmpdLines[3];
1221     ($AtomCount, $BondCount, $ChiralFlag) = ParseCmpdCountsLine(@$CmpdLines[3]);
1222     $NewAtomCount = @LargestFragment;
1223     $NewBondCount = 0;
1224     $AtomNum = 0;
1225     # Retrieve the largest fragment atom lines...
1226     for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) {
1227       $AtomNum++;
1228       if ($LargestFragmentAtoms{$AtomNum}) {
1229         push @WashedCmpdLines, @$CmpdLines[$LineIndex];
1230       }
1231     }
1232     # Retrieve the largest fragment bond lines...
1233     for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) {
1234       ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]);
1235       if ($LargestFragmentAtoms{$FirstAtomNum} && $LargestFragmentAtoms{$SecondAtomNum}) {
1236         $NewBondCount++;
1237         # Set up bond line with new atom number mapping...
1238         $FirstNewAtomNum =  $LargestFragmentAtoms{$FirstAtomNum};
1239         $SecondNewAtomNum =  $LargestFragmentAtoms{$SecondAtomNum};
1240         $BondLine = GenerateCmpdBondLine($FirstNewAtomNum, $SecondNewAtomNum, $BondType, $BondStereo);
1241         push @WashedCmpdLines, $BondLine;
1242       }
1243     }
1244     # Get property lines for CHG, ISO and RAD label and map the old atom numbers to new
1245     # atom numners; Others, property lines before M  END line are skipped as atom numbers for
1246     # other properties might not valid anymore...
1247     #
1248     $MENDLineIndex = $LineIndex;
1249     LINE: for ($LineIndex = (4 + $AtomCount + $BondCount); $LineIndex < @$CmpdLines; $LineIndex++) {
1250       $Line = @$CmpdLines[$LineIndex];
1251       if ($Line =~ /^M  END/i) {
1252         push @WashedCmpdLines, "M  END";
1253         $MENDLineIndex = $LineIndex;
1254         last LINE;
1255       }
1256 
1257       @ValuePairs = ();
1258       if ($Line =~ /^M  CHG/i) {
1259         @ValuePairs = ParseCmpdChargePropertyLine($Line);
1260       }
1261       elsif ($Line =~ /^M  RAD/i) {
1262         @ValuePairs = ParseCmpdRadicalPropertyLine($Line);
1263       }
1264       elsif ($Line =~ /^M  ISO/i) {
1265         @ValuePairs = ParseCmpdIsotopePropertyLine($Line);
1266       }
1267       elsif ($Line =~ /^A  /i) {
1268         my($NextLine);
1269         $LineIndex++;
1270         $NextLine = @$CmpdLines[$LineIndex];
1271         @ValuePairs = ParseCmpdAtomAliasPropertyLine($Line, $NextLine);
1272       }
1273       else {
1274         next LINE;
1275       }
1276 
1277       if (!@ValuePairs) {
1278         next LINE;
1279       }
1280 
1281       # Collect values for valid atom numbers with mapping to new atom numbers...
1282       @NewValuePairs = ();
1283       VALUEINDEX: for ($ValuePairIndex = 0; $ValuePairIndex < $#ValuePairs; $ValuePairIndex += 2) {
1284         $AtomNum = $ValuePairs[$ValuePairIndex]; $Value = $ValuePairs[$ValuePairIndex + 1];
1285         if (!exists $LargestFragmentAtoms{$AtomNum}) {
1286           next VALUEINDEX;
1287         }
1288         $NewAtomNum = $LargestFragmentAtoms{$AtomNum};
1289         push @NewValuePairs, ($NewAtomNum, $Value)
1290       }
1291       if (!@NewValuePairs) {
1292         next LINE;
1293       }
1294       @NewPropertyLines = ();
1295       if ($Line =~ /^M  CHG/i) {
1296         @NewPropertyLines = GenerateCmpdChargePropertyLines(\@NewValuePairs);
1297       }
1298       elsif ($Line =~ /^M  RAD/i) {
1299         @NewPropertyLines = GenerateCmpdRadicalPropertyLines(\@NewValuePairs);
1300       }
1301       elsif ($Line =~ /^M  ISO/i) {
1302         @NewPropertyLines = GenerateCmpdIsotopePropertyLines(\@NewValuePairs);
1303       }
1304       elsif ($Line =~ /^A  /i) {
1305         @NewPropertyLines = GenerateCmpdAtomAliasPropertyLines(\@NewValuePairs);
1306       }
1307       push @WashedCmpdLines, @NewPropertyLines;
1308     }
1309 
1310     # Retrieve rest of the data label and value property data...
1311     for ($LineIndex = (1 + $MENDLineIndex); $LineIndex < @$CmpdLines; $LineIndex++) {
1312       push @WashedCmpdLines, @$CmpdLines[$LineIndex];
1313     }
1314     # Update atom and bond count line...
1315     $WashedCmpdLines[3] = GenerateCmpdCountsLine($NewAtomCount, $NewBondCount, $ChiralFlag);
1316 
1317     $WashedCmpdString = join "\n", @WashedCmpdLines;
1318   }
1319   return ($FragmentCount, $Fragments, $WashedCmpdString);
1320 }
1321