MayaChemTools

   1 package MolecularFormula;
   2 #
   3 # File: MolecularFormula.pm
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
  16 # any warranty; without even the implied warranty of merchantability of fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use Carp;
  28 use Text::ParseWords;
  29 use TextUtil;
  30 use PeriodicTable;
  31 
  32 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  33 
  34 @ISA = qw(Exporter);
  35 @EXPORT = qw();
  36 @EXPORT_OK = qw(CalculateMolecularWeight CalculateExactMass CalculateElementalComposition FormatCompositionInfomation GetElementsAndCount IsMolecularFormula);
  37 
  38 %EXPORT_TAGS = (all  => [@EXPORT, @EXPORT_OK]);
  39 
  40 #
  41 # Calculate molecular weight assuming its a valid molecular formula...
  42 #
  43 sub CalculateMolecularWeight {
  44   my($MolecularFormula) = @_;
  45   my($Index, $MolecularWeight, $ElementSymbol, $ElementCount, $AtomicWeight, $FormulaElementsRef, $FormulaElementCountRef);
  46 
  47   ($FormulaElementsRef, $FormulaElementCountRef) = _ProcessMolecularFormula($MolecularFormula);
  48   if (!(defined($FormulaElementsRef) && defined($FormulaElementCountRef))) {
  49     return undef;
  50   }
  51 
  52   $MolecularWeight = 0;
  53 
  54   for $Index (0 .. $#{$FormulaElementsRef}) {
  55     $ElementSymbol = $FormulaElementsRef->[$Index];
  56     $ElementCount = $FormulaElementCountRef->[$Index];
  57     $AtomicWeight = PeriodicTable::GetElementAtomicWeight($ElementSymbol);
  58     $MolecularWeight += $AtomicWeight * $ElementCount;
  59   }
  60   return $MolecularWeight;
  61 }
  62 
  63 #
  64 # Calculate exact mass assuming it's a valid formula...
  65 #
  66 sub CalculateExactMass {
  67   my($MolecularFormula) = @_;
  68   my($Index, $ElementSymbol, $ElementCount, $ExactMass, $RelativeAtomicMass, $FormulaElementsRef, $FormulaElementCountRef);
  69 
  70   ($FormulaElementsRef, $FormulaElementCountRef) = _ProcessMolecularFormula($MolecularFormula);
  71   if (!(defined($FormulaElementsRef) && defined($FormulaElementCountRef))) {
  72     return undef;
  73   }
  74   $ExactMass = 0;
  75 
  76   for $Index (0 .. $#{$FormulaElementsRef}) {
  77     $ElementSymbol = $FormulaElementsRef->[$Index];
  78     $ElementCount = $FormulaElementCountRef->[$Index];
  79     $RelativeAtomicMass = PeriodicTable::GetElementMostAbundantNaturalIsotopeMass($ElementSymbol);
  80     if (!defined($RelativeAtomicMass)) {
  81       next ELEMENT;
  82     }
  83     $ExactMass += $RelativeAtomicMass * $ElementCount;
  84   }
  85   return $ExactMass;
  86 }
  87 
  88 
  89 #
  90 # Calculate elemental composition and return reference to arrays
  91 # containing elements and their percent composition...
  92 #
  93 sub CalculateElementalComposition {
  94   my($MolecularFormula) = @_;
  95   my($Index, $MolecularWeight, $ElementSymbol, $ElementCount, $AtomicWeight, $Composition, $CompositionMultiplier, $FormulaElementsRef, $FormulaElementCountRef, @FormulaElements, @FormulaElementComposition);
  96 
  97   $MolecularWeight = CalculateMolecularWeight($MolecularFormula);
  98   if (! defined $MolecularWeight) {
  99     return (undef, undef);
 100   }
 101   ($FormulaElementsRef, $FormulaElementCountRef) = _ProcessMolecularFormula($MolecularFormula);
 102 
 103   @FormulaElements = ();
 104   @FormulaElementComposition = ();
 105 
 106   if (!$MolecularWeight) {
 107     return ( \@FormulaElements, \@FormulaElementComposition);
 108   }
 109 
 110   $CompositionMultiplier = 100 / $MolecularWeight;
 111 
 112   for $Index (0 .. $#{$FormulaElementsRef}) {
 113     $ElementSymbol = $FormulaElementsRef->[$Index];
 114     $ElementCount = $FormulaElementCountRef->[$Index];
 115     $AtomicWeight = PeriodicTable::GetElementAtomicWeight($ElementSymbol);
 116     $Composition = ($AtomicWeight * $ElementCount) * $CompositionMultiplier;
 117 
 118     push @FormulaElements, $ElementSymbol;
 119     push @FormulaElementComposition, $Composition;
 120   }
 121 
 122   return ( \@FormulaElements, \@FormulaElementComposition);
 123 }
 124 
 125 # Using refernece to element and its composition arrays, format composition information
 126 # as: Element: Composition;...
 127 #
 128 sub FormatCompositionInfomation {
 129   my($Index, $ElementSymbol, $ElementComposition, $ElementsRef, $ElementCompositionRef, $Precision, $Composition);
 130 
 131   $Precision = 2;
 132   if (@_ == 3) {
 133     ($ElementsRef, $ElementCompositionRef, $Precision) = @_;
 134   }
 135   else {
 136     ($ElementsRef, $ElementCompositionRef) = @_;
 137   }
 138 
 139   $Composition = '';
 140   for $Index (0 .. $#{$ElementsRef}) {
 141     $ElementSymbol = $ElementsRef->[$Index];
 142     $ElementComposition = $ElementCompositionRef->[$Index];
 143     $ElementComposition = sprintf("%.${Precision}f", $ElementComposition);
 144 
 145     $Composition .= ($Composition) ? '; ' : '';
 146     $Composition .=  "${ElementSymbol}: ${ElementComposition}%";
 147   }
 148 
 149   return $Composition;
 150 }
 151 
 152 #
 153 # Get elements and their count...
 154 #
 155 sub GetElementsAndCount {
 156   my($MolecularFormula) = @_;
 157   my($FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg);
 158 
 159   ($FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg) = _ProcessMolecularFormula($MolecularFormula);
 160 
 161   return ($FormulaElementsRef, $FormulaElementCountRef);
 162 }
 163 
 164 #
 165 # Is it a valid molecular formula?
 166 #
 167 sub IsMolecularFormula {
 168   my($MolecularFormula, $PrintErrorMsg, $Status, $FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg);
 169 
 170   ($MolecularFormula) = @_;
 171 
 172   ($FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg) = _ProcessMolecularFormula($MolecularFormula);
 173   $Status = (defined($FormulaElementsRef) && defined($FormulaElementCountRef)) ? 1 : 0;
 174 
 175   return (wantarray ? ($Status, $ErrorMsg) : $Status);
 176 }
 177 
 178 #
 179 # Process molecular formula. For a valid formula, return references to arrays conatining elements
 180 # and element count; otherwsie, return undef.
 181 #
 182 sub _ProcessMolecularFormula {
 183   my($MolecularFormula) = @_;
 184   my($ErrorMsg) = '';
 185 
 186   $MolecularFormula = _CleanUpFormula($MolecularFormula);
 187 
 188   # Make sure it only contains numbers and letters...
 189   if ($MolecularFormula =~ /[^a-zA-Z0-9\(\)\[\]]/) {
 190     $ErrorMsg = 'Molecular formula contains characters other than a-zA-Z0-9';
 191     return (undef, undef, $ErrorMsg);
 192   }
 193 
 194   # Parse the formula...
 195   my($ElementSpec, $FormulaElementSpec, $Spec, $ElementSymbol, $ElementCount,  @FormulaElements, @ElementCount, %FormulaElementsToCountMap, @SubFormulaElements, %SubFormulaElementsToCountMap);
 196 
 197   @FormulaElements = (); @ElementCount = ();
 198   %FormulaElementsToCountMap = ();
 199 
 200 # Setup element symbol and count regular expression...
 201 # IUPAC: http://www.iupac.org/reports/provisional/abstract04/RB-prs310804/Chap4-3.04.pdf
 202 #
 203 
 204   $FormulaElementSpec = qr/
 205                                       \G(                         # $1
 206                                                   (?:
 207                                                       ([A-Z][a-z]?)   # Two or one letter element symbol; $2
 208                                                       ([0-9]*)          # Optionally followed by element count; $3
 209                                                   )
 210                                                   | \( | \[
 211                                                   | \)[0-9]* | \][0-9]*
 212                                                   | .
 213                                             )
 214                                       /x;
 215 
 216   my($ProcessingParenthesis);
 217   $ProcessingParenthesis = 0;
 218   # Go over the formula...
 219   FORMULA: while ($MolecularFormula =~ /$FormulaElementSpec/gx) {
 220     ($Spec, $ElementSymbol, $ElementCount) = ($1, $2, $3);
 221 
 222     # Handle parenthesis in formula to indicate repeating units...
 223     if ($Spec =~ /^(\(|\[)/) {
 224       if ($ProcessingParenthesis) {
 225         $ErrorMsg = "Molecular formula contains multiple level of () or []";
 226         return (undef, undef, $ErrorMsg);
 227       }
 228       $ProcessingParenthesis = 1;
 229       @SubFormulaElements = ();
 230       %SubFormulaElementsToCountMap = ();
 231       next FORMULA;
 232     }
 233     elsif ($Spec =~ /^(\)|\])/) {
 234       $ProcessingParenthesis = 0;
 235 
 236       # Retrieve repeat count and move data to @FormulaElements and %FormulaElementsToCountMap;
 237       my($RepeatCount, $Symbol, $Count);
 238       $RepeatCount = $Spec;
 239       $RepeatCount =~  s/(\)|\])//g;
 240       if (!$RepeatCount) {
 241         $RepeatCount = 1;
 242       }
 243       # Copy data...
 244       for $Symbol (@SubFormulaElements) {
 245         $Count = $SubFormulaElementsToCountMap{$Symbol} * $RepeatCount;
 246         _SetupFormulaElementData(\@FormulaElements, \%FormulaElementsToCountMap, $Symbol, $Count);
 247       }
 248 
 249       # Get ready again...
 250       @SubFormulaElements = ();
 251       %SubFormulaElementsToCountMap = ();
 252 
 253       next FORMULA;
 254     }
 255 
 256     # Retrieve element symbol and count...
 257     $ElementSymbol = ($Spec && !$ElementSymbol) ? $Spec : ($ElementSymbol ? $ElementSymbol : '');
 258     $ElementCount = $ElementCount ? $ElementCount : 1;
 259     if (!PeriodicTable::IsElement($ElementSymbol)) {
 260       $ErrorMsg = "Molecular formula contains unknown elemental symbol $ElementSymbol";
 261       return (undef, undef, $ErrorMsg);
 262     }
 263 
 264     if ($ProcessingParenthesis) {
 265       _SetupFormulaElementData(\@SubFormulaElements, \%SubFormulaElementsToCountMap, $ElementSymbol, $ElementCount);
 266     }
 267     else {
 268       _SetupFormulaElementData(\@FormulaElements, \%FormulaElementsToCountMap, $ElementSymbol, $ElementCount);
 269     }
 270   }
 271 
 272   # Setup element count array...
 273   for $ElementSymbol (@FormulaElements) {
 274     $ElementCount = $FormulaElementsToCountMap{$ElementSymbol};
 275     push @ElementCount, $ElementCount;
 276   }
 277 
 278   # Make sure it all adds up to 100%; otherwise, adjust the last value..
 279 
 280   return (\@FormulaElements, \@ElementCount, $ErrorMsg);
 281 }
 282 
 283 # Clean it up...
 284 sub _CleanUpFormula {
 285   my($MolecularFormula) = @_;
 286   #Take out any spaces...
 287   $MolecularFormula =~ s/ //g;
 288 
 289   # Eliminate any charge specifications: +, - or [1-9]+[+-]
 290   # e.g NO+ [Al(H2O)6]3+ [H2NO3]+
 291   if ($MolecularFormula =~ /[\+\-]/) {
 292     if ($MolecularFormula =~ /\][0-9]+[\+\-]/) {
 293       # Bracket followed optionally by number and then, +/- ...
 294       # [Al(H2O)6]3+ ...
 295       $MolecularFormula =~ s/\][0-9]+[\+\-]/\]/g;
 296     }
 297     elsif ($MolecularFormula =~ /[\+\-][0-9]*/) {
 298       # +/- followed optionally by a number...
 299       # C37H42N2O6+2, Cu+
 300       $MolecularFormula =~ s/[\+\-][0-9]*//g;
 301     }
 302   }
 303 
 304   # Eliminate any brackets - ] or ) - not followed by numbers:
 305   # e.g. Li[H2PO4]
 306   if ($MolecularFormula !~ /\][0-9]+/) {
 307     $MolecularFormula =~ s/[\[\]]//g;
 308   }
 309   if ($MolecularFormula !~ /\)[0-9]+/) {
 310     $MolecularFormula =~ s/[\(\)]//g;
 311   }
 312   # Change adducts to parenthesis format...
 313   # Na2CO3.10H2O -> Na2CO3(H2O)10
 314   # 3CdSO4.8H2O -> (CdSO4)3(H2O)8
 315   if ($MolecularFormula =~ /\./) {
 316     my($SubFormula, $Count, $Spec);
 317     my(@MolecularFormulaSplits) = split /\./, $MolecularFormula;
 318     $MolecularFormula = '';
 319     for $SubFormula (@MolecularFormulaSplits) {
 320       ($Count, $Spec) = $SubFormula =~ /^([0-9]*)(.*?)$/;
 321       if ($Count) {
 322         $MolecularFormula .= "(${Spec})${Count}";
 323       }
 324       else {
 325         $MolecularFormula .= $Spec;
 326       }
 327     }
 328   }
 329 
 330   return $MolecularFormula;
 331 }
 332 
 333 # Store the element and count...
 334 sub _SetupFormulaElementData {
 335   my($ElementsRef, $ElementsToCountMapRef, $Element, $Count) = @_;
 336 
 337   if (exists $ElementsToCountMapRef->{$Element}) {
 338     $ElementsToCountMapRef->{$Element} += $Count;
 339   }
 340   else {
 341     push @{$ElementsRef}, $Element;
 342     $ElementsToCountMapRef->{$Element} = $Count;
 343   }
 344 }
 345