1 #!/usr/bin/perl -w 2 # 3 # File: ElementalAnalysisSDFiles.pl 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2025 Manish Sud. All rights reserved. 7 # 8 # This file is part of MayaChemTools. 9 # 10 # MayaChemTools is free software; you can redistribute it and/or modify it under 11 # the terms of the GNU Lesser General Public License as published by the Free 12 # Software Foundation; either version 3 of the License, or (at your option) any 13 # later version. 14 # 15 # MayaChemTools is distributed in the hope that it will be useful, but without 16 # any warranty; without even the implied warranty of merchantability of fitness 17 # for a particular purpose. See the GNU Lesser General Public License for more 18 # details. 19 # 20 # You should have received a copy of the GNU Lesser General Public License 21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 23 # Boston, MA, 02111-1307, USA. 24 # 25 26 use strict; 27 use FindBin; use lib "$FindBin::Bin/../lib"; 28 use Getopt::Long; 29 use File::Basename; 30 use Text::ParseWords; 31 use Benchmark; 32 use FileUtil; 33 use SDFileUtil; 34 use TextUtil; 35 use MolecularFormula; 36 use FileIO::SDFileIO; 37 use Molecule; 38 39 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime); 40 41 # Autoflush STDOUT 42 $| = 1; 43 44 # Starting message... 45 $ScriptName = basename($0); 46 print "\n$ScriptName: Starting...\n\n"; 47 $StartTime = new Benchmark; 48 49 # Get the options and setup script... 50 SetupScriptUsage(); 51 if ($Options{help} || @ARGV < 1) { 52 die GetUsageFromPod("$FindBin::Bin/$ScriptName"); 53 } 54 55 my(@SDFilesList); 56 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd"); 57 58 print "Processing options...\n"; 59 my(%OptionsInfo); 60 ProcessOptions(); 61 62 print "Checking input SD file(s)...\n"; 63 my(%SDFilesInfo); 64 RetrieveSDFilesInfo(); 65 66 # Generate output files... 67 my($FileIndex); 68 if (@SDFilesList > 1) { 69 print "\nProcessing SD files...\n"; 70 } 71 for $FileIndex (0 .. $#SDFilesList) { 72 if ($SDFilesInfo{FileOkay}[$FileIndex]) { 73 print "\nProcessing file $SDFilesList[$FileIndex]...\n"; 74 PerformElementalAnalysis($FileIndex); 75 } 76 } 77 print "\n$ScriptName:Done...\n\n"; 78 79 $EndTime = new Benchmark; 80 $TotalTime = timediff ($EndTime, $StartTime); 81 print "Total time: ", timestr($TotalTime), "\n"; 82 83 ############################################################################### 84 85 # Perform analysis... 86 sub PerformElementalAnalysis { 87 my($Index) = @_; 88 my($SDFile, $NewSDFile, $KeyDataFieldName, $CmpdCount, $CurrentFormula, $FormulaFieldName, $CmpdString, $Value, $CalculationType, $CalculatedValue, $ErrorMsg, $Status, $ElementsRef, $ElementCompositionRef, $Molecule, @CalculatedValues, @CmpdLines, %DataFieldValuesMap); 89 90 $SDFile = $SDFilesList[$Index]; 91 $NewSDFile = $SDFilesInfo{OutFile}[$Index]; 92 93 print "Generating new SD file $NewSDFile...\n"; 94 open NEWSDFILE, ">$NewSDFile" or die "Error: Couldn't open $NewSDFile: $! \n"; 95 open SDFILE, "$SDFile" or die "Error: Can't open $SDFile: $! \n"; 96 97 98 $CmpdCount = 0; 99 $FormulaFieldName = $SDFilesInfo{FormulaFieldName}[$Index]; 100 101 COMPOUND: while ($CmpdString = ReadCmpdString(\*SDFILE)) { 102 $CmpdCount++; 103 @CmpdLines = split "\n", $CmpdString; 104 %DataFieldValuesMap = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines); 105 106 @CalculatedValues = (); 107 for $Value (@{$OptionsInfo{SpecifiedCalculations}}) { 108 push @CalculatedValues, ''; 109 } 110 111 $CurrentFormula = undef; 112 113 if ($OptionsInfo{UseStructureData}) { 114 $Molecule = FileIO::SDFileIO::ParseMoleculeString($CmpdString); 115 $CurrentFormula = $Molecule->GetMolecularFormula(); 116 } 117 else { 118 if (!exists $DataFieldValuesMap{$FormulaFieldName}) { 119 $ErrorMsg = "Ignoring compound record $CmpdCount: Formula field $FormulaFieldName not found"; 120 PrintErrorMsg($CmpdString, $ErrorMsg); 121 WriteNewCompoundRecord($Index, \*NEWSDFILE, \@CmpdLines, \@CalculatedValues); 122 next COMPOUND; 123 } 124 125 # Make sure it's a valid molecular formula... 126 $CurrentFormula = $DataFieldValuesMap{$FormulaFieldName}; 127 if ($OptionsInfo{CheckFormula}) { 128 ($Status, $ErrorMsg) = MolecularFormula::IsMolecularFormula($CurrentFormula); 129 if (!$Status) { 130 $ErrorMsg = "Ignoring compound record $CmpdCount: Formula field value $CurrentFormula is not valid: $ErrorMsg"; 131 PrintErrorMsg($CmpdString, $ErrorMsg); 132 WriteNewCompoundRecord($Index, \*NEWSDFILE, \@CmpdLines, \@CalculatedValues); 133 next COMPOUND; 134 } 135 } 136 } 137 138 # Calculate appropriate values and write 'em out... 139 @CalculatedValues = (); 140 for $CalculationType (@{$OptionsInfo{SpecifiedCalculations}}) { 141 if ($CalculationType =~ /^ElementalAnalysis$/i) { 142 ($ElementsRef, $ElementCompositionRef) = MolecularFormula::CalculateElementalComposition($CurrentFormula); 143 $CalculatedValue = (defined($ElementsRef) && defined($ElementCompositionRef)) ? MolecularFormula::FormatCompositionInfomation($ElementsRef, $ElementCompositionRef, $OptionsInfo{Precision}) : ''; 144 } 145 elsif ($CalculationType =~ /^MolecularWeight$/i) { 146 $CalculatedValue = MolecularFormula::CalculateMolecularWeight($CurrentFormula); 147 $CalculatedValue = (defined($CalculatedValue) && length($CalculatedValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CalculatedValue)) : ""; 148 } 149 elsif ($CalculationType =~ /^ExactMass$/i) { 150 $CalculatedValue = MolecularFormula::CalculateExactMass($CurrentFormula); 151 $CalculatedValue = (defined($CalculatedValue) && length($CalculatedValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CalculatedValue)) : ""; 152 } 153 else { 154 $CalculatedValue = ''; 155 } 156 push @CalculatedValues, $CalculatedValue; 157 } 158 WriteNewCompoundRecord($Index, \*NEWSDFILE, \@CmpdLines, \@CalculatedValues, $CurrentFormula); 159 } 160 close NEWSDFILE; 161 close SDFILE; 162 } 163 164 # Write out compound record with calculated values... 165 sub WriteNewCompoundRecord { 166 my($Index, $SDFileRef, $CmpdLinesRef, $CalculatedValuesRef, $MolecularFormula) = @_; 167 168 # Write out compound lines except the last line which contains $$$$... 169 my($LineIndex); 170 for $LineIndex (0 .. ($#{$CmpdLinesRef} - 1)) { 171 print $SDFileRef "$CmpdLinesRef->[$LineIndex]\n"; 172 } 173 174 # Write out calculated values... 175 my($CalcIndex, $FieldName, $FieldValue); 176 for $CalcIndex (0 .. $#{$OptionsInfo{SpecifiedCalculations}}) { 177 $FieldName = $SDFilesInfo{ValueFieldNamesMap}[$Index]{$OptionsInfo{SpecifiedCalculations}[$CalcIndex]}; 178 $FieldValue = $CalculatedValuesRef->[$CalcIndex]; 179 print $SDFileRef "> <$FieldName>\n$FieldValue\n\n"; 180 } 181 182 if ($OptionsInfo{UseStructureData} && $OptionsInfo{WriteOutFormula} && defined($MolecularFormula)) { 183 $FieldName = $SDFilesInfo{ValueFieldNamesMap}[$Index]{MolecularFormula}; 184 $FieldValue = $MolecularFormula; 185 print $SDFileRef "> <$FieldName>\n$FieldValue\n\n"; 186 } 187 188 print $SDFileRef "\$\$\$\$\n"; 189 } 190 191 # Print out error message... 192 sub PrintErrorMsg { 193 my($CmpdString, $ErrorMsg) = @_; 194 195 if ($OptionsInfo{DetailLevel} >= 2 ) { 196 print "$ErrorMsg:\n$CmpdString\n"; 197 } 198 elsif ($OptionsInfo{DetailLevel} >= 1) { 199 print "$ErrorMsg\n"; 200 } 201 } 202 203 # Retrieve information about input SD files... 204 sub RetrieveSDFilesInfo { 205 my($Index, $SDFile, $FileDir, $FileName, $FileExt, $OutFileRoot, $OutFile, $FormulaFieldName, $Value, $FieldName, $NewFieldName, $Count); 206 207 my(%NewValueFieldNames) = (ElementalAnalysis => 'ElementalAnalysis', MolecularWeight => 'MolecularWeight', ExactMass => 'ExactMass', MolecularFormula => 'MolecularFormula'); 208 if (@{$OptionsInfo{SpecifiedValueFieldNames}}) { 209 for ($Index = 0; $Index < @{$OptionsInfo{SpecifiedValueFieldNames}}; $Index +=2) { 210 $Value = $OptionsInfo{SpecifiedValueFieldNames}[$Index]; 211 $FieldName = $OptionsInfo{SpecifiedValueFieldNames}[$Index + 1]; 212 if (exists $NewValueFieldNames{$Value}) { 213 $NewValueFieldNames{$Value} = $FieldName; 214 } 215 } 216 } 217 218 %SDFilesInfo = (); 219 220 @{$SDFilesInfo{FileOkay}} = (); 221 @{$SDFilesInfo{OutFile}} = (); 222 @{$SDFilesInfo{FormulaFieldName}} = (); 223 @{$SDFilesInfo{ValueFieldNamesMap}} = (); 224 225 FILELIST: for $Index (0 .. $#SDFilesList) { 226 $SDFile = $SDFilesList[$Index]; 227 228 $SDFilesInfo{FileOkay}[$Index] = 0; 229 $SDFilesInfo{OutFile}[$Index] = ''; 230 $SDFilesInfo{FormulaFieldName}[$Index] = ''; 231 232 %{$SDFilesInfo{ValueFieldNamesMap}[$Index]} = (); 233 234 if (!(-e $SDFile)) { 235 warn "Warning: Ignoring file $SDFile: It doesn't exist\n"; 236 next FILELIST; 237 } 238 if (!CheckFileType($SDFile, "sd sdf")) { 239 warn "Warning: Ignoring file $SDFile: It's not a SD file\n"; 240 next FILELIST; 241 } 242 $FileDir = ""; $FileName = ""; $FileExt = ""; 243 ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile); 244 if ($Options{root} && (@SDFilesList == 1)) { 245 my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root}); 246 if ($RootFileName && $RootFileExt) { 247 $FileName = $RootFileName; 248 } 249 else { 250 $FileName = $Options{root}; 251 } 252 $OutFileRoot = $FileName; 253 } 254 else { 255 $OutFileRoot = $FileName . "ElementalAnalysis"; 256 } 257 258 $OutFile = $OutFileRoot . ".$FileExt"; 259 if (lc($OutFile) eq lc($SDFile)) { 260 warn "Warning: Ignoring file $SDFile:Output file name, $OutFile, is same as input SD file name, $SDFile\n"; 261 next FILELIST; 262 } 263 if (!$Options{overwrite}) { 264 if (-e $OutFile) { 265 warn "Warning: Ignoring file $SDFile: The file $OutFile already exists\n"; 266 next FILELIST; 267 } 268 } 269 # Get data field names and values... 270 my($CmpdString, $FieldName, @CmpdLines, @DataFieldNames, %DataFieldNamesMap); 271 @DataFieldNames = (); 272 if (!open(SDFILE, "$SDFile")) { 273 warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n"; 274 next FILELIST; 275 } 276 $CmpdString = ReadCmpdString(\*SDFILE); 277 close SDFILE; 278 279 @CmpdLines = split "\n", $CmpdString; 280 @DataFieldNames = GetCmpdDataHeaderLabels(\@CmpdLines); 281 %DataFieldNamesMap = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines); 282 283 # Setup formula field name... 284 $FormulaFieldName = ''; 285 if ($OptionsInfo{UseDataField}) { 286 if ($OptionsInfo{SpecifiedFormulaFieldName}) { 287 $FormulaFieldName = $OptionsInfo{SpecifiedFormulaFieldName}; 288 } 289 else { 290 FIELDNAME: for $FieldName (@DataFieldNames) { 291 if ($FieldName =~ /Formula/i) { 292 $FormulaFieldName = $FieldName; 293 last FIELDNAME; 294 } 295 } 296 if (!$FormulaFieldName) { 297 warn "Warning: Ignoring file $SDFile: Data field label containing the word Formula doesn't exist\n"; 298 next FILELIST; 299 } 300 } 301 } 302 $SDFilesInfo{FileOkay}[$Index] = 1; 303 $SDFilesInfo{OutFile}[$Index] = $OutFile; 304 $SDFilesInfo{FormulaFieldName}[$Index] = $FormulaFieldName; 305 306 # Setup value data field names for calculated values... 307 for $Value (keys %NewValueFieldNames) { 308 $FieldName = $NewValueFieldNames{$Value}; 309 310 # Make sure it doesn't already exists... 311 $Count = 1; 312 $NewFieldName = $FieldName; 313 while (exists $DataFieldNamesMap{$NewFieldName}) { 314 $Count++; 315 $NewFieldName = $FieldName . $Count; 316 } 317 $SDFilesInfo{ValueFieldNamesMap}[$Index]{$Value} = $NewFieldName; 318 } 319 } 320 } 321 322 # Process option values... 323 sub ProcessOptions { 324 %OptionsInfo = (); 325 326 $OptionsInfo{Mode} = $Options{mode}; 327 $OptionsInfo{FormulaMode} = $Options{formulamode}; 328 329 $OptionsInfo{WriteOutFormula} = ($Options{formulaout} =~ /^Yes$/i) ? 1 : 0; 330 331 $OptionsInfo{UseStructureData} = ($Options{formulamode} =~ /^StructureData$/i) ? 1 : 0; 332 $OptionsInfo{UseDataField} = ($Options{formulamode} =~ /^DataField$/i) ? 1 : 0; 333 334 $OptionsInfo{Fast} = defined $Options{fast} ? $Options{fast} : undef; 335 336 $OptionsInfo{DetailLevel} = $Options{detail}; 337 $OptionsInfo{CheckFormula} = $Options{fast} ? 0 : 1; 338 $OptionsInfo{Precision} = $Options{precision}; 339 340 $OptionsInfo{Overwrite} = defined $Options{overwrite} ? $Options{overwrite} : undef; 341 342 $OptionsInfo{FormulaField} = defined $Options{formulafield} ? $Options{formulafield} : undef; 343 $OptionsInfo{SpecifiedFormulaFieldName} = ""; 344 345 if (defined $Options{formulafield}) { 346 $OptionsInfo{SpecifiedFormulaFieldName} = $Options{formulafield}; 347 } 348 # Setup what to calculate... 349 @{$OptionsInfo{SpecifiedCalculations}} = (); 350 if ($Options{mode} =~ /^All$/i) { 351 @{$OptionsInfo{SpecifiedCalculations}} = qw(ElementalAnalysis MolecularWeight ExactMass); 352 } 353 else { 354 my($Mode, $ModeValue, @SpecifiedModeValues); 355 $Mode = $Options{mode}; 356 $Mode =~ s/ //g; 357 @SpecifiedModeValues = split /\,/, $Mode; 358 for $ModeValue (@SpecifiedModeValues) { 359 if ($ModeValue !~ /^(ElementalAnalysis|MolecularWeight|ExactMass)$/i) { 360 if ($ModeValue =~ /^All$/i) { 361 die "Error: All value for option \"-m --mode\" is not allowed with other valid values.\n"; 362 } 363 else { 364 die "Error: The value specified, $ModeValue, for option \"-m --mode\" is not valid. Allowed values: ElementalAnalysis, MolecularWeight, or ExactMass\n"; 365 } 366 } 367 push @{$OptionsInfo{SpecifiedCalculations}}, $ModeValue; 368 } 369 } 370 371 $OptionsInfo{ValueFieldNames} = defined $Options{valuefieldnames} ? $Options{valuefieldnames} : undef; 372 @{$OptionsInfo{SpecifiedValueFieldNames}} = (); 373 374 if ($Options{valuefieldnames}) { 375 my($Value, $Label, @ValueLabels); 376 @ValueLabels = split /\,/, $Options{valuefieldnames}; 377 if (@ValueLabels % 2) { 378 die "Error: The value specified, $Options{valuefieldnames}, for option \"-v --valuefieldnames\" is not valid: It must contain even number of comma delimited values\n"; 379 } 380 my($Index); 381 for ($Index = 0; $Index < @ValueLabels; $Index +=2) { 382 $Value = $ValueLabels[$Index]; 383 $Value =~ s/ //g; 384 $Label = $ValueLabels[$Index + 1]; 385 if ($Value !~ /^(ElementalAnalysis|MolecularWeight|ExactMass|MolecularFormula)$/i) { 386 die "Error: The value specified, $Value, using option \"-v --valuefieldnames\" is not valid. Allowed values: ElementalAnalysis, MolecularWeight, ExactMass, or MolecularFormula\n"; 387 } 388 push @{$OptionsInfo{SpecifiedValueFieldNames}}, ($Value, $Label); 389 } 390 } 391 } 392 393 # Setup script usage and retrieve command line arguments specified using various options... 394 sub SetupScriptUsage { 395 396 # Retrieve all the options... 397 %Options = (); 398 $Options{detail} = 1; 399 $Options{formulamode} = "DataField"; 400 $Options{formulaout} = "No"; 401 $Options{mode} = "All"; 402 $Options{precision} = 2; 403 404 if (!GetOptions(\%Options, "detail|d=i", "fast", "formulafield=s", "formulamode|f=s", "formulaout=s", "mode|m=s", "help|h", "overwrite|o", "precision|p=i", "root|r=s", "valuefieldnames|v=s", "workingdir|w=s")) { 405 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; 406 } 407 if ($Options{workingdir}) { 408 if (! -d $Options{workingdir}) { 409 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; 410 } 411 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; 412 } 413 if (!IsPositiveInteger($Options{detail})) { 414 die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n"; 415 } 416 if ($Options{formulamode} !~ /^(StructureData|DataField)$/i) { 417 die "Error: The value specified, $Options{formulamode}, for option \"-f, --formulamode\" is not valid. Allowed values: StructureData or DataField \n"; 418 } 419 if ($Options{formulaout} !~ /^(Yes|No)$/i) { 420 die "Error: The value specified, $Options{formulaout}, for option \"--formulaout\" is not valid. Allowed values: Yes or No \n"; 421 } 422 if (!IsPositiveInteger($Options{precision})) { 423 die "Error: The value specified, $Options{precision}, for option \"-p --precision\" is not valid. Allowed values: > 0 \n"; 424 } 425 } 426