#!/usr/bin/perl -w
#
# File: ModifySDFilesDataFields.pl
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2024 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability or fitness
# for a particular purpose.  See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use warnings;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use SDFileUtil;
use TextUtil;

# File-level state shared by the subroutines below:
#   %Options      raw command line values filled in by SetupScriptUsage
#   %OptionsInfo  validated/derived settings filled in by ProcessOptions
#   @SDFilesList  input SD file names
#   %SDFilesInfo  per-file FileOkay/OutFile lists filled in by RetrieveSDFilesInfo
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Generate output files...
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for my $FileIndex (0 .. $#SDFilesList) {
  # Files rejected by RetrieveSDFilesInfo have already been reported; skip them.
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    ModifySDFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Modify SD file data fields....
#
# Reads every compound from the input SD file at position $Index in
# @SDFilesList and writes it to the matching $SDFilesInfo{OutFile} entry,
# optionally replacing the molecule name line and/or rewriting the data
# fields as directed by %OptionsInfo.
#
# Parameters:
#   $Index - position of the file in @SDFilesList/%SDFilesInfo.
#
# Dies when either file can't be opened or the output file can't be closed.
#
sub ModifySDFile {
  my($Index) = @_;
  my($SDFile, $NewSDFile);

  $SDFile = $SDFilesList[$Index];
  $NewSDFile = $SDFilesInfo{OutFile}[$Index];

  print "Generating new SD file $NewSDFile...\n";
  # Three-argument open: file names are never interpreted as open modes.
  open my $NewSDFileHandle, '>', $NewSDFile or die "Error: Couldn't open $NewSDFile: $! \n";
  open my $SDFileHandle, '<', $SDFile or die "Error: Can't open $SDFile: $! \n";

  my($CmpdCount, $CmpdString);
  $CmpdCount = 0;

  COMPOUND: while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    $CmpdCount++;
    my(@CmpdLines) = split "\n", $CmpdString;

    # Data field values are only needed for naming by data field or for
    # rewriting data fields.
    my(%DataFieldAndValues) = ();
    if ($OptionsInfo{UseDataFieldForMolName} || $OptionsInfo{ModifyDataFields}) {
      %DataFieldAndValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
    }

    if ($OptionsInfo{ModifyMolName}) {
      if ($OptionsInfo{AlwaysReplaceMolName} || !IsNotEmpty($CmpdLines[0])) {
        my($MolName);
        if ($OptionsInfo{UseDataFieldForMolName} && exists($DataFieldAndValues{$OptionsInfo{MolNameDataField}})) {
          $MolName = $DataFieldAndValues{$OptionsInfo{MolNameDataField}};
          # Keep the molecule name line to at most 80 characters.
          if (length($MolName) > 80) {
            $MolName = substr($MolName, 0, 80);
          }
        }
        else {
          $MolName = "$OptionsInfo{MolNamePrefix}${CmpdCount}";
        }
        $CmpdLines[0] = $MolName;
        $CmpdString = join "\n", @CmpdLines;
      }
    }

    if (!$OptionsInfo{ModifyDataFields}) {
      # Just write the data and get the next compound...
      print {$NewSDFileHandle} "$CmpdString\n";
      next COMPOUND;
    }

    # Write out the structure data now and handle the old data fields later...
    my($CmpdData) = split /\n>/, $CmpdString;
    print {$NewSDFileHandle} "$CmpdData\n";

    # Modify specified data fields...
    for my $NewSDField (sort keys %{$OptionsInfo{SpecifiedNewToOldSDFieldMap}}) {
      # Collect the non-empty old values in a list and join them afterwards; a
      # truth test on the accumulated string would drop the separator after a
      # value of "0".
      my(@FieldValues) = ();
      for my $OldSDField (@{$OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}}) {
        if (exists($DataFieldAndValues{$OldSDField}) && defined($DataFieldAndValues{$OldSDField}) && length($DataFieldAndValues{$OldSDField})) {
          push @FieldValues, $DataFieldAndValues{$OldSDField};
        }
      }
      print {$NewSDFileHandle} "> <$NewSDField>\n", join("\n", @FieldValues), "\n\n";
    }

    # Add specified common fields...
    for my $CommonSDField (sort keys %{$OptionsInfo{SpecifiedCommonFieldMap}}) {
      print {$NewSDFileHandle} "> <$CommonSDField>\n$OptionsInfo{SpecifiedCommonFieldMap}{$CommonSDField}\n\n";
    }

    # Add a URL data field; its value stays empty when the compound has no
    # usable ID field value...
    if ($OptionsInfo{CreateDataFieldURL}) {
      my($URLValue, $URLCmpdIdFieldName);
      $URLValue = "";
      $URLCmpdIdFieldName = $OptionsInfo{URLCmpdIdFieldName};
      if (exists($DataFieldAndValues{$URLCmpdIdFieldName}) && length($DataFieldAndValues{$URLCmpdIdFieldName})) {
        $URLValue = "$OptionsInfo{URLCGIScriptName}?$OptionsInfo{URLParamName}=$DataFieldAndValues{$URLCmpdIdFieldName}";
      }
      print {$NewSDFileHandle} "> <$OptionsInfo{URLDataFieldLabel}>\n$URLValue\n\n";
    }

    # Handle old data fields and write 'em in the same order as they appear in the input
    # files...
    if ($OptionsInfo{KeepAllOldDataFields} || $OptionsInfo{KeepUnMappedOldDataFields}) {
      LABEL: for my $Label (GetCmpdDataHeaderLabels(\@CmpdLines)) {
        # In "unmappedonly" mode, fields already mapped to a new label are dropped.
        if (!$OptionsInfo{KeepAllOldDataFields} && exists($OptionsInfo{SpecifiedOldToNewSDFieldMap}{$Label})) {
          next LABEL;
        }
        my($OldValue) = defined($DataFieldAndValues{$Label}) ? $DataFieldAndValues{$Label} : '';
        print {$NewSDFileHandle} "> <$Label>\n$OldValue\n\n";
      }
    }

    print {$NewSDFileHandle} "\$\$\$\$\n";
  }
  # Buffered write errors only surface at close time; don't lose them.
  close $NewSDFileHandle or die "Error: Couldn't close $NewSDFile: $! \n";
  close $SDFileHandle;
}

# Process option values...
#
# Turns the raw values in %Options into the settings stored in %OptionsInfo:
# mode flags, molecule name settings, the new<->old data field maps, common
# data fields and the URL data field definition.
#
# Dies with an "Error: ..." message on inconsistent or malformed option
# values.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  # Default corresponds to "molname" mode.
  $OptionsInfo{ModifyMolName} = 1; $OptionsInfo{ModifyDataFields} = 0;
  if ($Options{mode} =~ /^both$/i) {
    $OptionsInfo{ModifyMolName} = 1; $OptionsInfo{ModifyDataFields} = 1;
  }
  elsif ($Options{mode} =~ /^datafields$/i) {
    $OptionsInfo{ModifyMolName} = 0; $OptionsInfo{ModifyDataFields} = 1;
  }

  $OptionsInfo{KeepOldDataFields} = $Options{keepolddatafields};
  $OptionsInfo{KeepAllOldDataFields} = ($Options{keepolddatafields} =~ /^all$/i) ? 1 : 0;
  $OptionsInfo{KeepUnMappedOldDataFields} = ($Options{keepolddatafields} =~ /^unmappedonly$/i) ? 1 : 0;

  $OptionsInfo{MolNameMode} = $Options{molnamemode};
  $OptionsInfo{UseDataFieldForMolName} = ($Options{molnamemode} =~ /^datafield$/i) ? 1 : 0;

  # --molname is either a data field label or a name prefix depending on
  # --molnamemode. Test for defined/length so that a value of "0" is honored.
  $OptionsInfo{MolName} = $Options{molname};
  $OptionsInfo{MolNameDataField} = ""; $OptionsInfo{MolNamePrefix} = "Cmpd";
  if (defined($Options{molname}) && length($Options{molname})) {
    if ($OptionsInfo{UseDataFieldForMolName}) {
      $OptionsInfo{MolNameDataField} = $Options{molname};
    }
    else {
      $OptionsInfo{MolNamePrefix} = $Options{molname};
    }
  }

  $OptionsInfo{MolNameReplace} = $Options{molnamereplace};
  $OptionsInfo{AlwaysReplaceMolName} = ($Options{molnamereplace} =~ /^always$/i) ? 1 : 0;

  if ($Options{datafieldsmap} && $Options{datafieldsmapfile}) {
    die "Error: Both \"--datafieldsmap\" and \"--datafieldsmapfile\" options specified: only one is allowed at a time\n";
  }

  $OptionsInfo{DataFieldsMap} = $Options{datafieldsmap} ? $Options{datafieldsmap} : '';
  $OptionsInfo{DataFieldsMapFile} = $Options{datafieldsmapfile} ? $Options{datafieldsmapfile} : '';

  $OptionsInfo{SpecifiedNewToOldSDFieldMap} = {};
  $OptionsInfo{SpecifiedOldToNewSDFieldMap} = {};

  # The map comes either straight from the command line or from a file whose
  # lines are concatenated into a single map string.
  my($SpecifiedDataFieldMap);
  $SpecifiedDataFieldMap = "";
  if ($Options{datafieldsmap}) {
    $SpecifiedDataFieldMap = $Options{datafieldsmap};
  }
  elsif ($Options{datafieldsmapfile}) {
    my($Line, @LineWords);
    open my $DataFieldsFileHandle, '<', $Options{datafieldsmapfile} or die "Couldn't open $Options{datafieldsmapfile}: $! \n";
    while ($Line = GetTextLine($DataFieldsFileHandle)) {
      @LineWords = quotewords(";", 0, $Line);
      $SpecifiedDataFieldMap .= JoinWords(\@LineWords, ";", 0);
    }
    close $DataFieldsFileHandle;
  }

  if ($SpecifiedDataFieldMap) {
    # Map format: NewLabel,OldLabel[,OldLabel...];NewLabel,OldLabel...
    for my $DataFieldMap (split ";", $SpecifiedDataFieldMap) {
      my(@DataFieldsSplit) = split ",", $DataFieldMap;
      # An entry needs a new label plus at least one old label. "< 2" also
      # rejects an empty entry such as the one in "a,b;;c,d".
      if (@DataFieldsSplit < 2) {
        die "Error: Invalid number of comma delimited values, ", scalar(@DataFieldsSplit), ", specified, @DataFieldsSplit, using \"--datafieldsmap or --datafieldsmapfile\" option: it must contain more than one value.\n";
      }
      for my $DataField (@DataFieldsSplit) {
        if (!(defined($DataField) && length($DataField))) {
          die "Error: One of the comma delimited values, \"", join(",", @DataFieldsSplit), "\", specified using \"--datafieldsmap or --datafieldsmapfile\" option is empty.\n";
        }
      }
      # First value is the new label; the remaining ones are the old labels.
      my($NewSDField, @OldSDFields) = @DataFieldsSplit;

      # Make sure a datafield is only specified once...
      if (exists $OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}) {
        die "Error: New data field, $NewSDField, specified more than once using \"--datafieldsmap or --datafieldsmapfile\" option.\n";
      }
      $OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField} = [@OldSDFields];
      for my $OldSDField (@OldSDFields) {
        if (exists $OptionsInfo{SpecifiedOldToNewSDFieldMap}{$OldSDField}) {
          die "Error: SD field, $OldSDField, specified more than once using \"--datafieldsmap or --datafieldsmapfile\" option.\n";
        }
        $OptionsInfo{SpecifiedOldToNewSDFieldMap}{$OldSDField} = $NewSDField;
      }
    }
  }

  $OptionsInfo{DataFieldsCommon} = $Options{datafieldscommon} ? $Options{datafieldscommon} : '';
  $OptionsInfo{SpecifiedCommonFieldMap} = {};

  if ($Options{datafieldscommon}) {
    # Format: Label,Value[,Label,Value...]
    my(@CommonDataFieldsSplit) = split ",", $Options{datafieldscommon};
    if (@CommonDataFieldsSplit % 2) {
      die "Error: Invalid number of comma delimited values, ", scalar(@CommonDataFieldsSplit), ", specified \"", join(",", @CommonDataFieldsSplit), "\" using \"--datafieldscommon\" option: it must contain even number of values.\n";
    }
    for (my $PairIndex = 0; $PairIndex < @CommonDataFieldsSplit; $PairIndex += 2) {
      my($DataFieldName, $DataFieldValue) = @CommonDataFieldsSplit[$PairIndex, $PairIndex + 1];
      if (exists $OptionsInfo{SpecifiedCommonFieldMap}{$DataFieldName}) {
        die "Error: Common data field, $DataFieldName, specified more than once using \"--datafieldscommon\" option.\n";
      }
      # A common field may not clash with any label used in the data fields map.
      if (exists($OptionsInfo{SpecifiedNewToOldSDFieldMap}{$DataFieldName}) || exists($OptionsInfo{SpecifiedOldToNewSDFieldMap}{$DataFieldName})) {
        die "Error: Common data field, $DataFieldName, specified using \"--datafieldscommon\" option cannot be specified in \"--datafieldsmap or --datafieldsmapfile\" option.\n";
      }
      $OptionsInfo{SpecifiedCommonFieldMap}{$DataFieldName} = $DataFieldValue;
    }
  }

  $OptionsInfo{DataFieldURL} = $Options{datafieldurl} ? $Options{datafieldurl} : '';
  $OptionsInfo{CreateDataFieldURL} = (exists($Options{datafieldurl}) && length($Options{datafieldurl}) ) ? 1 : 0;

  $OptionsInfo{URLDataFieldLabel} = ""; $OptionsInfo{URLCGIScriptName} = "";
  $OptionsInfo{URLParamName} = ""; $OptionsInfo{URLCmpdIdFieldName} = "";

  if ($OptionsInfo{CreateDataFieldURL}) {
    # Format: URLDataFieldLabel,CGIScriptName,ParamName,CmpdIdFieldName
    my(@DataFieldURLSplit) = split ",", $Options{datafieldurl};
    if (@DataFieldURLSplit != 4) {
      die "Error: Invalid number of values, ", scalar(@DataFieldURLSplit), ", specified using \"--datafieldURL\" option: it must contain 4 values.\n";
    }
    for my $Value (@DataFieldURLSplit) {
      if (!IsNotEmpty($Value)) {
        die "Error: One of the values, $Options{datafieldurl}, specified using \"--datafieldURL\" option is empty.\n";
      }
    }
    ($OptionsInfo{URLDataFieldLabel}, $OptionsInfo{URLCGIScriptName}, $OptionsInfo{URLParamName}, $OptionsInfo{URLCmpdIdFieldName}) = @DataFieldURLSplit;
  }

}

# Retrieve information about input SD files...
#
# For each file in @SDFilesList, decides whether it can be processed and what
# its output file name is. Results go into the parallel lists
# $SDFilesInfo{FileOkay} (0/1) and $SDFilesInfo{OutFile}. Files that are
# skipped are reported with a warning and left with FileOkay set to 0.
#
sub RetrieveSDFilesInfo {
  %SDFilesInfo = ();
  $SDFilesInfo{FileOkay} = [];
  $SDFilesInfo{OutFile} = [];

  FILELIST: for my $Index (0 .. $#SDFilesList) {
    my($SDFile) = $SDFilesList[$Index];

    # Assume the worst until every check below has passed.
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFile}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    my($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    my($OutFileRoot);
    if ($Options{root} && (@SDFilesList == 1)) {
      # --root is only honored for a single input file; any extension it
      # carries is dropped in favor of the input file's extension.
      my(undef, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
      $OutFileRoot = ($RootFileName && $RootFileExt) ? $RootFileName : $Options{root};
    }
    else {
      $OutFileRoot = $FileName . "ModifiedDataFields";
    }

    my($OutFile) = $OutFileRoot . ".$FileExt";
    if (lc($OutFile) eq lc($SDFile)) {
      warn "Warning: Ignoring file $SDFile:Output file name, $OutFile, is same as input SD file name, $SDFile\n";
      next FILELIST;
    }
    if (!$Options{overwrite} && -e $OutFile) {
      warn "Warning: Ignoring file $SDFile: The file $OutFile already exists\n";
      next FILELIST;
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{OutFile}[$Index] = $OutFile;
  }
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Fills %Options with defaults, overlays the command line values using
# GetOptions, changes to the working directory when one is specified and
# validates the enumerated option values.
#
# Dies with an explanatory message on any invalid option or value.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();
  $Options{detail} = 1;
  $Options{keepolddatafields} = "none";
  $Options{mode} = "molname";
  $Options{molnamemode} = "labelprefix";
  $Options{molnamereplace} = "empty";

  if (!GetOptions(\%Options, "detail|d=i", "datafieldscommon=s", "datafieldsmap=s", "datafieldsmapfile=s", "datafieldurl=s", "help|h", "keepolddatafields|k=s", "mode|m=s", "molname=s", "molnamemode=s", "molnamereplace=s", "overwrite|o", "root|r=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{keepolddatafields} !~ /^(all|unmappedonly|none)$/i) {
    die "Error: The value specified, $Options{keepolddatafields}, for option \"-k --keepolddatafields\" is not valid. Allowed values: all, unmappedonly, or none\n";
  }
  if ($Options{mode} !~ /^(molname|datafields|both)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: molname, datafields, or both\n";
  }
  if ($Options{molnamemode} !~ /^(datafield|labelprefix)$/i) {
    die "Error: The value specified, $Options{molnamemode}, for option \"--molnamemode\" is not valid. Allowed values: datafield or labelprefix\n";
  }
  if ($Options{molnamereplace} !~ /^(always|empty)$/i) {
    die "Error: The value specified, $Options{molnamereplace}, for option \"--molnamereplace\" is not valid. Allowed values: always or empty\n";
  }
  if (!IsPositiveInteger($Options{detail})) {
    die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
  }
}
