1 package MolecularFormula; 2 # 3 # File: MolecularFormula.pm 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # This file is part of MayaChemTools. 9 # 10 # MayaChemTools is free software; you can redistribute it and/or modify it under 11 # the terms of the GNU Lesser General Public License as published by the Free 12 # Software Foundation; either version 3 of the License, or (at your option) any 13 # later version. 14 # 15 # MayaChemTools is distributed in the hope that it will be useful, but without 16 # any warranty; without even the implied warranty of merchantability of fitness 17 # for a particular purpose. See the GNU Lesser General Public License for more 18 # details. 19 # 20 # You should have received a copy of the GNU Lesser General Public License 21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 23 # Boston, MA, 02111-1307, USA. 24 # 25 26 use strict; 27 use Carp; 28 use Text::ParseWords; 29 use TextUtil; 30 use PeriodicTable; 31 32 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); 33 34 @ISA = qw(Exporter); 35 @EXPORT = qw(); 36 @EXPORT_OK = qw(CalculateMolecularWeight CalculateExactMass CalculateElementalComposition FormatCompositionInfomation GetElementsAndCount IsMolecularFormula); 37 38 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); 39 40 # 41 # Calculate molecular weight assuming its a valid molecular formula... 42 # 43 sub CalculateMolecularWeight { 44 my($MolecularFormula) = @_; 45 my($Index, $MolecularWeight, $ElementSymbol, $ElementCount, $AtomicWeight, $FormulaElementsRef, $FormulaElementCountRef); 46 47 ($FormulaElementsRef, $FormulaElementCountRef) = _ProcessMolecularFormula($MolecularFormula); 48 if (!(defined($FormulaElementsRef) && defined($FormulaElementCountRef))) { 49 return undef; 50 } 51 52 $MolecularWeight = 0; 53 54 for $Index (0 .. $#{$FormulaElementsRef}) { 55 $ElementSymbol = $FormulaElementsRef->[$Index]; 56 $ElementCount = $FormulaElementCountRef->[$Index]; 57 $AtomicWeight = PeriodicTable::GetElementAtomicWeight($ElementSymbol); 58 $MolecularWeight += $AtomicWeight * $ElementCount; 59 } 60 return $MolecularWeight; 61 } 62 63 # 64 # Calculate exact mass assuming it's a valid formula... 65 # 66 sub CalculateExactMass { 67 my($MolecularFormula) = @_; 68 my($Index, $ElementSymbol, $ElementCount, $ExactMass, $RelativeAtomicMass, $FormulaElementsRef, $FormulaElementCountRef); 69 70 ($FormulaElementsRef, $FormulaElementCountRef) = _ProcessMolecularFormula($MolecularFormula); 71 if (!(defined($FormulaElementsRef) && defined($FormulaElementCountRef))) { 72 return undef; 73 } 74 $ExactMass = 0; 75 76 for $Index (0 .. $#{$FormulaElementsRef}) { 77 $ElementSymbol = $FormulaElementsRef->[$Index]; 78 $ElementCount = $FormulaElementCountRef->[$Index]; 79 $RelativeAtomicMass = PeriodicTable::GetElementMostAbundantNaturalIsotopeMass($ElementSymbol); 80 if (!defined($RelativeAtomicMass)) { 81 next ELEMENT; 82 } 83 $ExactMass += $RelativeAtomicMass * $ElementCount; 84 } 85 return $ExactMass; 86 } 87 88 89 # 90 # Calculate elemental composition and return reference to arrays 91 # containing elements and their percent composition... 92 # 93 sub CalculateElementalComposition { 94 my($MolecularFormula) = @_; 95 my($Index, $MolecularWeight, $ElementSymbol, $ElementCount, $AtomicWeight, $Composition, $CompositionMultiplier, $FormulaElementsRef, $FormulaElementCountRef, @FormulaElements, @FormulaElementComposition); 96 97 $MolecularWeight = CalculateMolecularWeight($MolecularFormula); 98 if (! defined $MolecularWeight) { 99 return (undef, undef); 100 } 101 ($FormulaElementsRef, $FormulaElementCountRef) = _ProcessMolecularFormula($MolecularFormula); 102 103 @FormulaElements = (); 104 @FormulaElementComposition = (); 105 106 if (!$MolecularWeight) { 107 return ( \@FormulaElements, \@FormulaElementComposition); 108 } 109 110 $CompositionMultiplier = 100 / $MolecularWeight; 111 112 for $Index (0 .. $#{$FormulaElementsRef}) { 113 $ElementSymbol = $FormulaElementsRef->[$Index]; 114 $ElementCount = $FormulaElementCountRef->[$Index]; 115 $AtomicWeight = PeriodicTable::GetElementAtomicWeight($ElementSymbol); 116 $Composition = ($AtomicWeight * $ElementCount) * $CompositionMultiplier; 117 118 push @FormulaElements, $ElementSymbol; 119 push @FormulaElementComposition, $Composition; 120 } 121 122 return ( \@FormulaElements, \@FormulaElementComposition); 123 } 124 125 # Using refernece to element and its composition arrays, format composition information 126 # as: Element: Composition;... 127 # 128 sub FormatCompositionInfomation { 129 my($Index, $ElementSymbol, $ElementComposition, $ElementsRef, $ElementCompositionRef, $Precision, $Composition); 130 131 $Precision = 2; 132 if (@_ == 3) { 133 ($ElementsRef, $ElementCompositionRef, $Precision) = @_; 134 } 135 else { 136 ($ElementsRef, $ElementCompositionRef) = @_; 137 } 138 139 $Composition = ''; 140 for $Index (0 .. $#{$ElementsRef}) { 141 $ElementSymbol = $ElementsRef->[$Index]; 142 $ElementComposition = $ElementCompositionRef->[$Index]; 143 $ElementComposition = sprintf("%.${Precision}f", $ElementComposition); 144 145 $Composition .= ($Composition) ? '; ' : ''; 146 $Composition .= "${ElementSymbol}: ${ElementComposition}%"; 147 } 148 149 return $Composition; 150 } 151 152 # 153 # Get elements and their count... 154 # 155 sub GetElementsAndCount { 156 my($MolecularFormula) = @_; 157 my($FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg); 158 159 ($FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg) = _ProcessMolecularFormula($MolecularFormula); 160 161 return ($FormulaElementsRef, $FormulaElementCountRef); 162 } 163 164 # 165 # Is it a valid molecular formula? 166 # 167 sub IsMolecularFormula { 168 my($MolecularFormula, $PrintErrorMsg, $Status, $FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg); 169 170 ($MolecularFormula) = @_; 171 172 ($FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg) = _ProcessMolecularFormula($MolecularFormula); 173 $Status = (defined($FormulaElementsRef) && defined($FormulaElementCountRef)) ? 1 : 0; 174 175 return (wantarray ? ($Status, $ErrorMsg) : $Status); 176 } 177 178 # 179 # Process molecular formula. For a valid formula, return references to arrays conatining elements 180 # and element count; otherwsie, return undef. 181 # 182 sub _ProcessMolecularFormula { 183 my($MolecularFormula) = @_; 184 my($ErrorMsg) = ''; 185 186 $MolecularFormula = _CleanUpFormula($MolecularFormula); 187 188 # Make sure it only contains numbers and letters... 189 if ($MolecularFormula =~ /[^a-zA-Z0-9\(\)\[\]]/) { 190 $ErrorMsg = 'Molecular formula contains characters other than a-zA-Z0-9'; 191 return (undef, undef, $ErrorMsg); 192 } 193 194 # Parse the formula... 195 my($ElementSpec, $FormulaElementSpec, $Spec, $ElementSymbol, $ElementCount, @FormulaElements, @ElementCount, %FormulaElementsToCountMap, @SubFormulaElements, %SubFormulaElementsToCountMap); 196 197 @FormulaElements = (); @ElementCount = (); 198 %FormulaElementsToCountMap = (); 199 200 # Setup element symbol and count regular expression... 201 # IUPAC: http://www.iupac.org/reports/provisional/abstract04/RB-prs310804/Chap4-3.04.pdf 202 # 203 204 $FormulaElementSpec = qr/ 205 \G( # $1 206 (?: 207 ([A-Z][a-z]?) # Two or one letter element symbol; $2 208 ([0-9]*) # Optionally followed by element count; $3 209 ) 210 | \( | \[ 211 | \)[0-9]* | \][0-9]* 212 | . 213 ) 214 /x; 215 216 my($ProcessingParenthesis); 217 $ProcessingParenthesis = 0; 218 # Go over the formula... 219 FORMULA: while ($MolecularFormula =~ /$FormulaElementSpec/gx) { 220 ($Spec, $ElementSymbol, $ElementCount) = ($1, $2, $3); 221 222 # Handle parenthesis in formula to indicate repeating units... 223 if ($Spec =~ /^(\(|\[)/) { 224 if ($ProcessingParenthesis) { 225 $ErrorMsg = "Molecular formula contains multiple level of () or []"; 226 return (undef, undef, $ErrorMsg); 227 } 228 $ProcessingParenthesis = 1; 229 @SubFormulaElements = (); 230 %SubFormulaElementsToCountMap = (); 231 next FORMULA; 232 } 233 elsif ($Spec =~ /^(\)|\])/) { 234 $ProcessingParenthesis = 0; 235 236 # Retrieve repeat count and move data to @FormulaElements and %FormulaElementsToCountMap; 237 my($RepeatCount, $Symbol, $Count); 238 $RepeatCount = $Spec; 239 $RepeatCount =~ s/(\)|\])//g; 240 if (!$RepeatCount) { 241 $RepeatCount = 1; 242 } 243 # Copy data... 244 for $Symbol (@SubFormulaElements) { 245 $Count = $SubFormulaElementsToCountMap{$Symbol} * $RepeatCount; 246 _SetupFormulaElementData(\@FormulaElements, \%FormulaElementsToCountMap, $Symbol, $Count); 247 } 248 249 # Get ready again... 250 @SubFormulaElements = (); 251 %SubFormulaElementsToCountMap = (); 252 253 next FORMULA; 254 } 255 256 # Retrieve element symbol and count... 257 $ElementSymbol = ($Spec && !$ElementSymbol) ? $Spec : ($ElementSymbol ? $ElementSymbol : ''); 258 $ElementCount = $ElementCount ? $ElementCount : 1; 259 if (!PeriodicTable::IsElement($ElementSymbol)) { 260 $ErrorMsg = "Molecular formula contains unknown elemental symbol $ElementSymbol"; 261 return (undef, undef, $ErrorMsg); 262 } 263 264 if ($ProcessingParenthesis) { 265 _SetupFormulaElementData(\@SubFormulaElements, \%SubFormulaElementsToCountMap, $ElementSymbol, $ElementCount); 266 } 267 else { 268 _SetupFormulaElementData(\@FormulaElements, \%FormulaElementsToCountMap, $ElementSymbol, $ElementCount); 269 } 270 } 271 272 # Setup element count array... 273 for $ElementSymbol (@FormulaElements) { 274 $ElementCount = $FormulaElementsToCountMap{$ElementSymbol}; 275 push @ElementCount, $ElementCount; 276 } 277 278 # Make sure it all adds up to 100%; otherwise, adjust the last value.. 279 280 return (\@FormulaElements, \@ElementCount, $ErrorMsg); 281 } 282 283 # Clean it up... 284 sub _CleanUpFormula { 285 my($MolecularFormula) = @_; 286 #Take out any spaces... 287 $MolecularFormula =~ s/ //g; 288 289 # Eliminate any charge specifications: +, - or [1-9]+[+-] 290 # e.g NO+ [Al(H2O)6]3+ [H2NO3]+ 291 if ($MolecularFormula =~ /[\+\-]/) { 292 if ($MolecularFormula =~ /\][0-9]+[\+\-]/) { 293 # Bracket followed optionally by number and then, +/- ... 294 # [Al(H2O)6]3+ ... 295 $MolecularFormula =~ s/\][0-9]+[\+\-]/\]/g; 296 } 297 elsif ($MolecularFormula =~ /[\+\-][0-9]*/) { 298 # +/- followed optionally by a number... 299 # C37H42N2O6+2, Cu+ 300 $MolecularFormula =~ s/[\+\-][0-9]*//g; 301 } 302 } 303 304 # Eliminate any brackets - ] or ) - not followed by numbers: 305 # e.g. Li[H2PO4] 306 if ($MolecularFormula !~ /\][0-9]+/) { 307 $MolecularFormula =~ s/[\[\]]//g; 308 } 309 if ($MolecularFormula !~ /\)[0-9]+/) { 310 $MolecularFormula =~ s/[\(\)]//g; 311 } 312 # Change adducts to parenthesis format... 313 # Na2CO3.10H2O -> Na2CO3(H2O)10 314 # 3CdSO4.8H2O -> (CdSO4)3(H2O)8 315 if ($MolecularFormula =~ /\./) { 316 my($SubFormula, $Count, $Spec); 317 my(@MolecularFormulaSplits) = split /\./, $MolecularFormula; 318 $MolecularFormula = ''; 319 for $SubFormula (@MolecularFormulaSplits) { 320 ($Count, $Spec) = $SubFormula =~ /^([0-9]*)(.*?)$/; 321 if ($Count) { 322 $MolecularFormula .= "(${Spec})${Count}"; 323 } 324 else { 325 $MolecularFormula .= $Spec; 326 } 327 } 328 } 329 330 return $MolecularFormula; 331 } 332 333 # Store the element and count... 334 sub _SetupFormulaElementData { 335 my($ElementsRef, $ElementsToCountMapRef, $Element, $Count) = @_; 336 337 if (exists $ElementsToCountMapRef->{$Element}) { 338 $ElementsToCountMapRef->{$Element} += $Count; 339 } 340 else { 341 push @{$ElementsRef}, $Element; 342 $ElementsToCountMapRef->{$Element} = $Count; 343 } 344 } 345