MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # File: ModifySDFilesDataFields.pl
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
   16 # any warranty; without even the implied warranty of merchantability or fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use FindBin; use lib "$FindBin::Bin/../lib";
  28 use Getopt::Long;
  29 use File::Basename;
  30 use Text::ParseWords;
  31 use Benchmark;
  32 use FileUtil;
  33 use SDFileUtil;
  34 use TextUtil;
  35 
  36 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  37 
  38 # Autoflush STDOUT
  39 $| = 1;
  40 
  41 # Starting message...
  42 $ScriptName = basename($0);
  43 print "\n$ScriptName: Starting...\n\n";
  44 $StartTime = new Benchmark;
  45 
  46 # Get the options and setup script...
  47 SetupScriptUsage();
  48 if ($Options{help} || @ARGV < 1) {
  49   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  50 }
  51 
  52 my(@SDFilesList);
  53 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  54 
  55 # Process options...
  56 print "Processing options...\n";
  57 my(%OptionsInfo);
  58 ProcessOptions();
  59 
  60 print "Checking input SD file(s)...\n";
  61 my(%SDFilesInfo);
  62 RetrieveSDFilesInfo();
  63 
  64 # Generate output files...
  65 my($FileIndex);
  66 if (@SDFilesList > 1) {
  67   print "\nProcessing SD files...\n";
  68 }
  69 for $FileIndex (0 .. $#SDFilesList) {
  70   if ($SDFilesInfo{FileOkay}[$FileIndex]) {
  71     print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  72     ModifySDFile($FileIndex);
  73   }
  74 }
  75 print "\n$ScriptName:Done...\n\n";
  76 
  77 $EndTime = new Benchmark;
  78 $TotalTime = timediff ($EndTime, $StartTime);
  79 print "Total time: ", timestr($TotalTime), "\n";
  80 
  81 ###############################################################################
  82 
  83 # Modify SD file data fields....
  84 sub ModifySDFile {
  85   my($Index) = @_;
  86   my($SDFile, $NewSDFile);
  87 
  88   $SDFile = $SDFilesList[$Index];
  89   $NewSDFile = $SDFilesInfo{OutFile}[$Index];
  90 
  91   print "Generating new SD file $NewSDFile...\n";
  92   open NEWSDFILE, ">$NewSDFile" or die "Error: Couldn't open $NewSDFile: $! \n";
  93   open SDFILE, "$SDFile" or die "Error: Can't open $SDFile: $! \n";
  94 
  95   my($CmpdCount, $CmpdString, $CmpdData, $MolName, $OldSDField, $NewSDField, $CommonSDField, $Label, $Value, $FieldValues, $MolNameDataField, $URLCmpdIdFieldName, @CmpdLines, %DataFieldAndValues, @DataFieldLabels);
  96   $CmpdCount = 0;
  97 
  98   COMPOUND: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
  99       $CmpdCount++;
 100       @CmpdLines = split "\n", $CmpdString;
 101       if ($OptionsInfo{UseDataFieldForMolName} || $OptionsInfo{ModifyDataFields}) {
 102         %DataFieldAndValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 103       }
 104       if ($OptionsInfo{ModifyMolName}) {
 105         if ($OptionsInfo{AlwaysReplaceMolName} || !IsNotEmpty($CmpdLines[0])) {
 106           $MolNameDataField = $OptionsInfo{MolNameDataField};
 107           if ($OptionsInfo{UseDataFieldForMolName} && exists($DataFieldAndValues{$MolNameDataField})) {
 108             $MolName = $DataFieldAndValues{$MolNameDataField};
 109             if (length($MolName) > 80) {
 110               $MolName = substr($MolName, 0, 80);
 111             }
 112           }
 113           else {
 114             $MolName = "$OptionsInfo{MolNamePrefix}${CmpdCount}";
 115           }
 116           $CmpdLines[0] = $MolName;
 117           $CmpdString = join "\n", @CmpdLines;
 118         }
 119       }
 120       if (!$OptionsInfo{ModifyDataFields}) {
 121         # Just write the data and get the next compound...
 122         print NEWSDFILE "$CmpdString\n";
 123         next COMPOUND;
 124       }
 125       # Write out the structure data now and handle the old data fields later...
 126       ($CmpdData) = split /\n>/, $CmpdString;
 127       print NEWSDFILE "$CmpdData\n";
 128 
 129       # Modify specified data fields...
 130       for $NewSDField (sort keys %{$OptionsInfo{SpecifiedNewToOldSDFieldMap}}) {
 131         $FieldValues = "";
 132         for $OldSDField (@{$OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}}) {
 133           if (exists($DataFieldAndValues{$OldSDField}) && length($DataFieldAndValues{$OldSDField})) {
 134             $Value = $DataFieldAndValues{$OldSDField};
 135             $FieldValues .= ($FieldValues) ? "\n$Value" : $Value;
 136           }
 137         }
 138         print NEWSDFILE "> <$NewSDField>\n$FieldValues\n\n";
 139       }
 140       # Add specified common fields...
 141       for $CommonSDField (sort keys %{$OptionsInfo{SpecifiedCommonFieldMap}}) {
 142         $Value = $OptionsInfo{SpecifiedCommonFieldMap}{$CommonSDField};
 143         print NEWSDFILE "> <$CommonSDField>\n$Value\n\n";
 144       }
 145       if ($OptionsInfo{CreateDataFieldURL}) {
 146         $Value = "";
 147         $URLCmpdIdFieldName = $OptionsInfo{URLCmpdIdFieldName};
 148         if (exists($DataFieldAndValues{$URLCmpdIdFieldName}) && length($DataFieldAndValues{$URLCmpdIdFieldName})) {
 149           $Value = $DataFieldAndValues{$URLCmpdIdFieldName};
 150           $Value = "$OptionsInfo{URLCGIScriptName}?$OptionsInfo{URLParamName}=${Value}";
 151         }
 152         print NEWSDFILE "> <$OptionsInfo{URLDataFieldLabel}>\n$Value\n\n";
 153       }
 154 
 155       # Handle old data fields and write 'em in the same order as they appear in the input
 156       # files...
 157       if ($OptionsInfo{KeepAllOldDataFields} || $OptionsInfo{KeepUnMappedOldDataFields}) {
 158         my($KeepLabel);
 159         @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
 160         LABEL: for $Label (@DataFieldLabels) {
 161           $KeepLabel = $OptionsInfo{KeepAllOldDataFields} ? 1 : ( exists($OptionsInfo{SpecifiedOldToNewSDFieldMap}{$Label}) ? 0 : 1  );
 162           if (!$KeepLabel) {
 163             next LABEL;
 164           }
 165           $Value = $DataFieldAndValues{$Label};
 166           print NEWSDFILE "> <$Label>\n$Value\n\n";
 167         }
 168       }
 169 
 170       print NEWSDFILE "\$\$\$\$\n";
 171   }
 172   close NEWSDFILE;
 173   close SDFILE;
 174 }
 175 
 176 # Process option values...
 177 sub ProcessOptions {
 178   %OptionsInfo = ();
 179 
 180   $OptionsInfo{Mode} = $Options{mode};
 181 
 182   $OptionsInfo{ModifyMolName} = 1; $OptionsInfo{ModifyDataFields} = 0;
 183   if ($Options{mode} =~ /^both$/i) {
 184     $OptionsInfo{ModifyMolName} = 1; $OptionsInfo{ModifyDataFields} = 1;
 185   }
 186   elsif ($Options{mode} =~ /^datafields$/i) {
 187     $OptionsInfo{ModifyMolName} = 0; $OptionsInfo{ModifyDataFields} = 1;
 188   }
 189 
 190   $OptionsInfo{KeepOldDataFields} = $Options{keepolddatafields};
 191   $OptionsInfo{KeepAllOldDataFields} = ($Options{keepolddatafields} =~ /^all$/i) ? 1 : 0;
 192   $OptionsInfo{KeepUnMappedOldDataFields} = ($Options{keepolddatafields} =~ /^unmappedonly$/i) ? 1 : 0;
 193 
 194   $OptionsInfo{MolNameMode} = $Options{molnamemode};
 195   $OptionsInfo{UseDataFieldForMolName} = ($Options{molnamemode} =~ /^datafield$/i) ? 1 : 0;
 196 
 197   $OptionsInfo{MolName} = $Options{molname};
 198   $OptionsInfo{MolNameDataField} = ""; $OptionsInfo{MolNamePrefix} = "Cmpd";
 199   if ($Options{molname}) {
 200     if ($OptionsInfo{UseDataFieldForMolName}) {
 201       $OptionsInfo{MolNameDataField} = $Options{molname};
 202     }
 203     else {
 204       $OptionsInfo{MolNamePrefix} = $Options{molname};
 205     }
 206   }
 207 
 208   $OptionsInfo{MolNameReplace} = $Options{molnamereplace};
 209   $OptionsInfo{AlwaysReplaceMolName} = ($Options{molnamereplace} =~ /^always$/i) ? 1 : 0;
 210 
 211   if ($Options{datafieldsmap} && $Options{datafieldsmapfile}) {
 212     die "Error: Both \"--datafieldsmap\" and  \"--datafieldsmapfile\" options specified: only one is allowed at a time\n";
 213   }
 214 
 215   $OptionsInfo{DataFieldsMap} = $Options{datafieldsmap} ? $Options{datafieldsmap} : '';
 216   $OptionsInfo{DataFieldsMapFile} = $Options{datafieldsmapfile} ? $Options{datafieldsmapfile} : '';
 217 
 218   my($SpecifiedDataFieldMap);
 219 
 220   %{$OptionsInfo{SpecifiedNewToOldSDFieldMap}} = ();
 221   %{$OptionsInfo{SpecifiedOldToNewSDFieldMap}} = ();
 222 
 223   $SpecifiedDataFieldMap = "";
 224   if ($Options{datafieldsmap}) {
 225     $SpecifiedDataFieldMap = $Options{datafieldsmap};
 226   }
 227   elsif ($Options{datafieldsmapfile}) {
 228     my($Line, @LineWords);
 229     open DATAFIELDSFILE, "$Options{datafieldsmapfile}" or die "Couldn't  open $Options{datafieldsmapfile}: $! \n";
 230     while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
 231       @LineWords = quotewords(";", 0, $Line);
 232       $SpecifiedDataFieldMap .= JoinWords(\@LineWords, ";", 0);
 233     }
 234     close DATAFIELDSFILE;
 235   }
 236 
 237   if ($SpecifiedDataFieldMap) {
 238     my($DataFieldMap, $DataField, $NewSDField, @OldSDFields, @DataFieldMapSplit, @DataFieldsSplit, $FirstField);
 239     @DataFieldMapSplit = split ";", $SpecifiedDataFieldMap;
 240     for $DataFieldMap (@DataFieldMapSplit) {
 241       @DataFieldsSplit = split ",", $DataFieldMap;
 242       if (@DataFieldsSplit == 1) {
 243         die "Error: Invalid number of comma delimited values, ", scalar(@DataFieldsSplit), ", specified,  @DataFieldsSplit, using \"--datafieldsmap or --datafieldsmapfile\" option: it must contain more than one value.\n";
 244       }
 245       $FirstField = 1;
 246       @OldSDFields = ();
 247       for $DataField (@DataFieldsSplit) {
 248         if (!(defined($DataField) && length($DataField))) {
 249           die "Error: One of the comma delimited values, \"", join(",", @DataFieldsSplit), "\", specified using \"--datafieldsmap or --datafieldsmapfile\" option is empty.\n";
 250         }
 251         if ($FirstField) {
 252           $FirstField = 0;
 253           $NewSDField = $DataField;
 254         }
 255         else {
 256           push @OldSDFields, $DataField;
 257         }
 258       }
 259       # Make sure a datafield is only specified once...
 260       if (exists $OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}) {
 261         die "Error: New data field, $NewSDField, specified more than once using \"--datafieldsmap or --datafieldsmapfile\" option.\n";
 262       }
 263       @{$OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}} = ();
 264       push @{$OptionsInfo{SpecifiedNewToOldSDFieldMap}{$NewSDField}}, @OldSDFields;
 265       for $DataField (@OldSDFields) {
 266         if (exists $OptionsInfo{SpecifiedOldToNewSDFieldMap}{$DataField} ) {
 267           die "Error: SD field, $DataField, specified more than once using \"--datafieldsmap or --datafieldsmapfile\" option.\n";
 268         }
 269         else {
 270           $OptionsInfo{SpecifiedOldToNewSDFieldMap}{$DataField} = $NewSDField;
 271         }
 272       }
 273 
 274     }
 275   }
 276 
 277   $OptionsInfo{DataFieldsCommon} = $Options{datafieldscommon} ? $Options{datafieldscommon} : '';
 278   %{$OptionsInfo{SpecifiedCommonFieldMap}} = ();
 279 
 280   if ($Options{datafieldscommon}) {
 281     my($DataFieldName, $DataFieldValue, $Index, @CommonDataFieldsSplit);
 282     @CommonDataFieldsSplit = split ",", $Options{datafieldscommon};
 283     if (@CommonDataFieldsSplit % 2) {
 284         die "Error: Invalid number of comma delimited values, ", scalar(@CommonDataFieldsSplit), ", specified \"",  join(",", @CommonDataFieldsSplit), "\" using \"--datafieldscommon\" option: it must contain even number of values.\n";
 285     }
 286     for ($Index = 0; $Index < @CommonDataFieldsSplit; $Index += 2) {
 287       $DataFieldName = $CommonDataFieldsSplit[$Index];
 288       $DataFieldValue = $CommonDataFieldsSplit[$Index + 1];
 289       if (exists $OptionsInfo{SpecifiedCommonFieldMap}{$DataFieldName}) {
 290         die "Error: Common data field, $DataFieldName, specified more than once using \"--datafieldscommon\" option.\n";
 291       }
 292       if (exists($OptionsInfo{SpecifiedNewToOldSDFieldMap}{$DataFieldName}) || exists($OptionsInfo{SpecifiedOldToNewSDFieldMap}{$DataFieldName})) {
 293         die "Error: Common data field, $DataFieldName, specified using \"--datafieldscommon\" option cannot be specified in \"--datafieldsmap or --datafieldsmapfile\" option.\n";
 294       }
 295       $OptionsInfo{SpecifiedCommonFieldMap}{$DataFieldName} = $DataFieldValue;
 296     }
 297   }
 298 
 299   $OptionsInfo{DataFieldURL} = $Options{datafieldurl} ? $Options{datafieldurl} : '';
 300   $OptionsInfo{CreateDataFieldURL} = (exists($Options{datafieldurl}) && length($Options{datafieldurl}) ) ? 1 : 0;
 301 
 302   $OptionsInfo{URLDataFieldLabel} = ""; $OptionsInfo{URLCGIScriptName} = "";
 303   $OptionsInfo{URLParamName} = ""; $OptionsInfo{URLCmpdIdFieldName} = "";
 304 
 305   if ($OptionsInfo{CreateDataFieldURL}) {
 306     my(@DataFieldURLSplit, $Value);
 307     @DataFieldURLSplit = split ",", $Options{datafieldurl};
 308     if (@DataFieldURLSplit != 4) {
 309       die "Error: Invalid number of values, ", scalar(@DataFieldURLSplit), ", specified using \"--datafieldURL\" option: it must contain 4 values.\n";
 310     }
 311     for $Value (@DataFieldURLSplit) {
 312       if (!IsNotEmpty($Value)) {
 313         die "Error: One of the values, $Options{datafieldurl}, specified using \"--datafieldURL\" option is empty.\n";
 314       }
 315     }
 316     $OptionsInfo{URLDataFieldLabel} = $DataFieldURLSplit[0];
 317     $OptionsInfo{URLCGIScriptName} = $DataFieldURLSplit[1];
 318     $OptionsInfo{URLParamName}  = $DataFieldURLSplit[2];
 319     $OptionsInfo{URLCmpdIdFieldName} = $DataFieldURLSplit[3];
 320   }
 321 
 322 }
 323 
 324 # Retrieve information about input SD files...
 325 sub RetrieveSDFilesInfo {
 326   my($Index, $SDFile, $FileDir, $FileName, $FileExt, $OutFileRoot,  $OutFile, $DataFieldName);
 327 
 328   %SDFilesInfo = ();
 329   @{$SDFilesInfo{FileOkay}} = ();
 330   @{$SDFilesInfo{OutFile}} = ();
 331 
 332    FILELIST: for $Index (0 .. $#SDFilesList) {
 333     $SDFile = $SDFilesList[$Index];
 334 
 335     $SDFilesInfo{FileOkay}[$Index] = 0;
 336     $SDFilesInfo{OutFile}[$Index] = '';
 337 
 338     if (!(-e $SDFile)) {
 339       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 340       next FILELIST;
 341     }
 342     if (!CheckFileType($SDFile, "sd sdf")) {
 343       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 344       next FILELIST;
 345     }
 346     $FileDir = ""; $FileName = ""; $FileExt = "";
 347     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 348     if ($Options{root} && (@SDFilesList == 1)) {
 349       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
 350       if ($RootFileName && $RootFileExt) {
 351         $FileName = $RootFileName;
 352       }
 353       else {
 354         $FileName = $Options{root};
 355       }
 356       $OutFileRoot = $FileName;
 357     }
 358     else {
 359       $OutFileRoot = $FileName . "ModifiedDataFields";
 360     }
 361 
 362     $OutFile = $OutFileRoot . ".$FileExt";
 363     if (lc($OutFile) eq lc($SDFile)) {
 364       warn "Warning: Ignoring file $SDFile:Output file name, $OutFile, is same as input SD file name, $SDFile\n";
 365       next FILELIST;
 366     }
 367     if (!$Options{overwrite}) {
 368       if (-e $OutFile) {
 369         warn "Warning: Ignoring file $SDFile: The file $OutFile already exists\n";
 370         next FILELIST;
 371       }
 372     }
 373 
 374     $SDFilesInfo{FileOkay}[$Index] = 1;
 375     $SDFilesInfo{OutFile}[$Index] = $OutFile;
 376   }
 377 }
 378 
 379 # Setup script usage  and retrieve command line arguments specified using various options...
 380 sub SetupScriptUsage {
 381 
 382   # Retrieve all the options...
 383   %Options = ();
 384   $Options{detail} = 1;
 385   $Options{keepolddatafields} = "none";
 386   $Options{mode} = "molname";
 387   $Options{molnamemode} = "labelprefix";
 388   $Options{molnamereplace} = "empty";
 389 
 390   if (!GetOptions(\%Options, "detail|d=i", "datafieldscommon=s", "datafieldsmap=s", "datafieldsmapfile=s", "datafieldurl=s", "help|h", "keepolddatafields|k=s", "mode|m=s", "molname=s", "molnamemode=s", "molnamereplace=s", "overwrite|o", "root|r=s", "workingdir|w=s")) {
 391     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 392   }
 393   if ($Options{workingdir}) {
 394     if (! -d $Options{workingdir}) {
 395       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 396     }
 397     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 398   }
 399   if ($Options{keepolddatafields} !~ /^(all|unmappedonly|none)$/i) {
 400     die "Error: The value specified, $Options{keepolddatafields}, for option \"-k --keepolddatafields\" is not valid. Allowed values: all, unmappedonly, or none\n";
 401   }
 402   if ($Options{mode} !~ /^(molname|datafields|both)$/i) {
 403     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: molname, datafields, or both\n";
 404   }
 405   if ($Options{molnamemode} !~ /^(datafield|labelprefix)$/i) {
 406     die "Error: The value specified, $Options{molnamemode}, for option \"--molnamemode\" is not valid. Allowed values: datafield or labelprefix\n";
 407   }
 408   if ($Options{molnamereplace} !~ /^(always|empty)$/i) {
 409     die "Error: The value specified, $Options{molnamereplace}, for option \"--molnamereplace\" is not valid. Allowed values: always or empty\n";
 410   }
 411   if (!IsPositiveInteger($Options{detail})) {
 412     die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
 413   }
 414 }
 415