MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # File: SortSDFiles.pl
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
   16 # any warranty; without even the implied warranty of merchantability or fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use FindBin; use lib "$FindBin::Bin/../lib";
  28 use Getopt::Long;
  29 use File::Basename;
  30 use Text::ParseWords;
  31 use Benchmark;
  32 use FileUtil;
  33 use SDFileUtil;
  34 use TextUtil;
  35 
  36 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  37 
  38 # Autoflush STDOUT
  39 $| = 1;
  40 
  41 # Starting message...
  42 $ScriptName = basename($0);
  43 print "\n$ScriptName: Starting...\n\n";
  44 $StartTime = new Benchmark;
  45 
  46 # Get the options and setup script...
  47 SetupScriptUsage();
  48 if ($Options{help} || @ARGV < 1) {
  49   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  50 }
  51 
  52 my(@SDFilesList);
  53 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  54 
  55 print "Processing options...\n";
  56 my(%OptionsInfo);
  57 ProcessOptions();
  58 
  59 print "Checking input SD file(s)...\n";
  60 my(%SDFilesInfo);
  61 RetrieveSDFilesInfo();
  62 
  63 # Generate output files...
  64 my($FileIndex);
  65 if (@SDFilesList > 1) {
  66   print "\nProcessing SD files...\n";
  67 }
  68 for $FileIndex (0 .. $#SDFilesList) {
  69   if ($SDFilesInfo{FileOkay}[$FileIndex]) {
  70     print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  71     SortSDFile($FileIndex);
  72   }
  73 }
  74 print "\n$ScriptName:Done...\n\n";
  75 
  76 $EndTime = new Benchmark;
  77 $TotalTime = timediff ($EndTime, $StartTime);
  78 print "Total time: ", timestr($TotalTime), "\n";
  79 
  80 ###############################################################################
  81 
  82 # Sort it out...
  83 sub SortSDFile {
  84   my($Index) = @_;
  85   my($SDFile, $NewSDFile, $KeyDataFieldName);
  86 
  87   $SDFile = $SDFilesList[$Index];
  88   $NewSDFile = $SDFilesInfo{OutFile}[$Index];
  89   $KeyDataFieldName = $SDFilesInfo{KeyDataFieldName}[$Index];
  90 
  91   print "Generating new SD file $NewSDFile...\n";
  92   open NEWSDFILE, ">$NewSDFile" or die "Error: Couldn't open $NewSDFile: $! \n";
  93   open SDFILE, "$SDFile" or die "Error: Can't open $SDFile: $! \n";
  94 
  95   # Go over all compound records and store 'em using key value as hash...
  96   my(%KeyToCompundRecordsMap, @InvalidCompoundRecords, $CmpdCount, $CmpdString, @CmpdLines, %DataFieldValues, $KeyDataFieldValue);
  97   %KeyToCompundRecordsMap = ();
  98   @InvalidCompoundRecords = ();
  99   $CmpdCount = 0;
 100 
 101   COMPOUND: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 102       $CmpdCount++;
 103       @CmpdLines = split "\n", $CmpdString;
 104       %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 105       #Make sure data field value is okay...
 106       if (!(IsNotEmpty($DataFieldValues{$KeyDataFieldName}) && ($DataFieldValues{$KeyDataFieldName} !~ /\n/))) {
 107         push @InvalidCompoundRecords, $CmpdString;
 108         if ($OptionsInfo{DetailLevel} >= 3 ) {
 109           print "Ignoring compound record $CmpdCount: Contains empty value for key data field $KeyDataFieldName :\n $CmpdString\n\n";
 110         }
 111         elsif ($OptionsInfo{DetailLevel} >= 2) {
 112           print "Ignoring compound record $CmpdCount: Contains empty value for key data field $KeyDataFieldName...\n";
 113         }
 114         next COMPOUND;
 115       }
 116       $KeyDataFieldValue = $DataFieldValues{$KeyDataFieldName};
 117       if ($OptionsInfo{KeyData} =~ /^numeric$/i) {
 118         if (!IsFloat($KeyDataFieldValue)) {
 119           push @InvalidCompoundRecords, $CmpdString;
 120           if ($OptionsInfo{DetailLevel} >= 3 ) {
 121             print "Ignoring compound record $CmpdCount: Contains non-numerical value for key data field $KeyDataFieldName :\n $CmpdString\n\n";
 122           }
 123           elsif ($OptionsInfo{DetailLevel} >= 2) {
 124             print "Ignoring compound record $CmpdCount: Contains non-numerical value for key data field $KeyDataFieldName...\n";
 125           }
 126           next COMPOUND;
 127         }
 128       }
 129       if (exists($KeyToCompundRecordsMap{$KeyDataFieldValue})) {
 130         # Append to existing coompund data...
 131         $KeyToCompundRecordsMap{$KeyDataFieldValue} .= "\n" . $CmpdString;
 132       }
 133       else {
 134         $KeyToCompundRecordsMap{$KeyDataFieldValue} = $CmpdString;
 135       }
 136   }
 137 
 138   if ($OptionsInfo{Sort} =~ /^ascending$/i) {
 139     if ($OptionsInfo{KeyData} =~ /^alphanumeric$/i) {
 140       for $KeyDataFieldValue (sort { lc($a) cmp lc($b) } keys %KeyToCompundRecordsMap ) {
 141         print NEWSDFILE "$KeyToCompundRecordsMap{$KeyDataFieldValue}\n";
 142       }
 143     }
 144     else {
 145       for $KeyDataFieldValue (sort { $a <=> $b } keys %KeyToCompundRecordsMap ) {
 146         print NEWSDFILE "$KeyToCompundRecordsMap{$KeyDataFieldValue}\n";
 147       }
 148     }
 149   }
 150   else {
 151     if ($OptionsInfo{KeyData} =~ /^alphanumeric$/i) {
 152       for $KeyDataFieldValue (sort { lc($b) cmp lc($a) } keys %KeyToCompundRecordsMap ) {
 153         print NEWSDFILE "$KeyToCompundRecordsMap{$KeyDataFieldValue}\n";
 154       }
 155     }
 156     else {
 157       for $KeyDataFieldValue (sort { $b <=> $a } keys %KeyToCompundRecordsMap ) {
 158         print NEWSDFILE "$KeyToCompundRecordsMap{$KeyDataFieldValue}\n";
 159       }
 160     }
 161   }
 162   # Append the records containing data not appropriate for sorting...
 163   if (@InvalidCompoundRecords) {
 164     print "Placing ", scalar(@InvalidCompoundRecords)," compound record(s) with invalid data field key data the end...\n";
 165     for $CmpdString (@InvalidCompoundRecords) {
 166       print NEWSDFILE "$CmpdString\n";
 167     }
 168   }
 169   close NEWSDFILE;
 170   close SDFILE;
 171 }
 172 
 173 # Retrieve information about input SD files...
 174 sub RetrieveSDFilesInfo {
 175   my($Index, $SDFile, $FileDir, $FileName, $FileExt, $OutFileRoot,  $OutFile, $DataFieldName);
 176 
 177   %SDFilesInfo = ();
 178 
 179   @{$SDFilesInfo{FileOkay}} = ();
 180   @{$SDFilesInfo{OutFile}} = ();
 181   @{$SDFilesInfo{KeyDataFieldName}} = ();
 182 
 183   FILELIST: for $Index (0 .. $#SDFilesList) {
 184     $SDFile = $SDFilesList[$Index];
 185     $SDFilesInfo{FileOkay}[$Index] = 0;
 186     $SDFilesInfo{OutFile}[$Index] = "";
 187     $SDFilesInfo{KeyDataFieldName}[$Index] = "";
 188 
 189     if (!(-e $SDFile)) {
 190       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 191       next FILELIST;
 192     }
 193     if (!CheckFileType($SDFile, "sd sdf")) {
 194       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 195       next FILELIST;
 196     }
 197     $FileDir = ""; $FileName = ""; $FileExt = "";
 198     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 199     if ($Options{root} && (@SDFilesList == 1)) {
 200       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
 201       if ($RootFileName && $RootFileExt) {
 202         $FileName = $RootFileName;
 203       }
 204       else {
 205         $FileName = $Options{root};
 206       }
 207       $OutFileRoot = $FileName;
 208     }
 209     else {
 210       $OutFileRoot = $FileName . "SortedByDataField";
 211     }
 212 
 213     $OutFile = $OutFileRoot . ".$FileExt";
 214     if (lc($OutFile) eq lc($SDFile)) {
 215       warn "Warning: Ignoring file $SDFile:Output file name, $OutFile, is same as input SD file name, $SDFile\n";
 216       next FILELIST;
 217     }
 218     if (!$Options{overwrite}) {
 219       if (-e $OutFile) {
 220         warn "Warning: Ignoring file $SDFile: The file $OutFile already exists\n";
 221         next FILELIST;
 222       }
 223     }
 224     # Setup data field name...
 225     if ($OptionsInfo{SpecifiedDataFieldName}) {
 226       $DataFieldName = $OptionsInfo{SpecifiedDataFieldName};
 227     }
 228     else {
 229       my($CmpdString, @CmpdLines, @DataFieldNames);
 230       @DataFieldNames = ();
 231       if (!open(SDFILE, "$SDFile")) {
 232         warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
 233         next FILELIST;
 234       }
 235       $CmpdString = ReadCmpdString(\*SDFILE);
 236       close SDFILE;
 237 
 238       @CmpdLines = split "\n", $CmpdString;
 239       @DataFieldNames = GetCmpdDataHeaderLabels(\@CmpdLines);
 240       $DataFieldName = $DataFieldNames[0];
 241     }
 242 
 243     $SDFilesInfo{FileOkay}[$Index] = 1;
 244     $SDFilesInfo{OutFile}[$Index] = "$OutFile";
 245     $SDFilesInfo{KeyDataFieldName}[$Index] = $DataFieldName;
 246   }
 247 }
 248 
 249 # Process option values...
 250 sub ProcessOptions {
 251   $OptionsInfo{DetailLevel} = $Options{detail};
 252 
 253   $OptionsInfo{Key} = defined $Options{key} ? $Options{key} : undef;
 254   $OptionsInfo{SpecifiedDataFieldName} = "";
 255   if (defined $Options{key}) {
 256     $OptionsInfo{SpecifiedDataFieldName} = $Options{key};
 257   }
 258 
 259   $OptionsInfo{KeyData} = $Options{keydata};
 260   $OptionsInfo{Sort} = $Options{sort};
 261 
 262   $OptionsInfo{Overwrite} = defined $Options{overwrite} ? $Options{overwrite} : undef;
 263   $OptionsInfo{Root} = defined $Options{root} ? $Options{root} : undef;
 264 }
 265 
 266 # Setup script usage  and retrieve command line arguments specified using various options...
 267 sub SetupScriptUsage {
 268 
 269   # Retrieve all the options...
 270   %Options = ();
 271   $Options{detail} = 1;
 272   $Options{sort} = "ascending";
 273   $Options{keydata} = "numeric";
 274   if (!GetOptions(\%Options, "detail|d=i", "help|h",  "key|k=s", "keydata=s", "overwrite|o", "root|r=s", "sort|s=s", "workingdir|w=s")) {
 275     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 276   }
 277   if ($Options{workingdir}) {
 278     if (! -d $Options{workingdir}) {
 279       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 280     }
 281     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 282   }
 283   if ($Options{keydata} !~ /^(numeric|alphanumeric)$/i) {
 284     die "Error: The value specified, $Options{keydata}, for option \"--keydata\" is not valid. Allowed values: numeric or alphanumeric\n";
 285   }
 286   if ($Options{sort} !~ /^(ascending|descending)$/i) {
 287     die "Error: The value specified, $Options{sort}, for option \"-s --sort\" is not valid. Allowed values: ascending or descending\n";
 288   }
 289   if (!IsPositiveInteger($Options{detail})) {
 290     die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
 291   }
 292 }
 293