MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # File: SplitSDFiles.pl
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
  16 # any warranty; without even the implied warranty of merchantability or fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use FindBin; use lib "$FindBin::Bin/../lib";
  28 use Getopt::Long;
  29 use File::Basename;
  30 use Benchmark;
  31 use SDFileUtil;
  32 use FileUtil;
  33 
  34 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  35 
  36 # Autoflush STDOUT
  37 $| = 1;
  38 
  39 # Starting message...
  40 $ScriptName = basename $0;
  41 print "\n$ScriptName:Starting...\n\n";
  42 $StartTime = new Benchmark;
  43 
  44 # Get the options and setup script...
  45 SetupScriptUsage();
  46 if ($Options{help} || @ARGV < 1) {
  47   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  48 }
  49 
  50 my(@SDFilesList);
  51 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  52 
  53 # Process options...
  54 print "Processing options...\n";
  55 my(%OptionsInfo);
  56 ProcessOptions();
  57 
  58 # Setup information about input files...
  59 my(%SDFilesInfo);
  60 print "Checking input SD file(s)...\n";
  61 RetrieveSDFilesInfo();
  62 
  63 # Process input files..
  64 my($FileIndex);
  65 if (@SDFilesList > 1) {
  66   print "\nProcessing SD files...\n";
  67 }
  68 for $FileIndex (0 .. $#SDFilesList) {
  69   if ($SDFilesInfo{FileOkay}[$FileIndex]) {
  70     print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  71     SplitSDFile($FileIndex);
  72   }
  73 }
  74 print "\n$ScriptName:Done...\n\n";
  75 
  76 $EndTime = new Benchmark;
  77 $TotalTime = timediff ($EndTime, $StartTime);
  78 print "Total time: ", timestr($TotalTime), "\n";
  79 
  80 ###############################################################################
  81 
  82 # Split a SD file...
  83 #
  84 sub SplitSDFile {
  85   my($FileIndex) = @_;
  86 
  87   if ($OptionsInfo{Mode} =~ /^Files$/i) {
  88     SplitSDFileByNumOfFiles($FileIndex);
  89   }
  90   elsif ($OptionsInfo{Mode} =~ /^Cmpds$/i) {
  91     SplitSDFileByNumOfCmpds($FileIndex);
  92   }
  93 }
  94 
  95 # Split SD into specified number of files...
  96 #
  97 sub SplitSDFileByNumOfFiles {
  98   my($FileIndex) = @_;
  99   my($SDFile, $CmpdCount, $MaxCmpdsPerFile, $MaxNumOfFiles);
 100 
 101   $SDFile = $SDFilesList[$FileIndex];
 102 
 103   if (!open SDFILE, "$SDFile") {
 104     warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
 105     return;
 106   }
 107 
 108   $MaxNumOfFiles = $OptionsInfo{NumOfFiles};
 109 
 110   # Count number of compounds to figure out maximum number of compound per file...
 111   $CmpdCount = 0;
 112   while (<SDFILE>) {
 113     if (/^\$\$\$\$/) {
 114       $CmpdCount++;
 115     }
 116   }
 117   close SDFILE;
 118 
 119   if ($CmpdCount < $MaxNumOfFiles) {
 120     warn "Warning: Ignoring file $SDFile: Total number of compounds, $CmpdCount, is smaller than number of new files, $MaxNumOfFiles\n";
 121     return;
 122   }
 123 
 124   $MaxCmpdsPerFile = int $CmpdCount / $MaxNumOfFiles;
 125 
 126   SplitSDFileByNumOfFilesAndCmpds($FileIndex, $MaxNumOfFiles, $MaxCmpdsPerFile);
 127 }
 128 
 129 # Split SD into files containing specified number of compounds...
 130 #
 131 sub SplitSDFileByNumOfCmpds {
 132   my($FileIndex) = @_;
 133 
 134   if ($OptionsInfo{NumOfCmpds} == 1) {
 135     SplitSDFileByOneCmpdPerFile($FileIndex);
 136   }
 137   else {
 138     SplitSDFileByNumOfCmpdsPerFile($FileIndex);
 139   }
 140 }
 141 
 142 # Split SD into files containing one compound per file...
 143 #
 144 sub SplitSDFileByOneCmpdPerFile {
 145   my($FileIndex) = @_;
 146   my($SDFile, $NewSDFile, $NewSDFileRoot, $FileExt, $OutFileRoot, $OverwriteFiles, $UseDataField, $DataFieldName, $UseMolName, $CmpdCount, $CmpdString, @CmpdLines, %DataFieldValues);
 147 
 148   $SDFile = $SDFilesList[$FileIndex];
 149 
 150   if (!open SDFILE, "$SDFile") {
 151     warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
 152     return;
 153   }
 154 
 155   print "\n";
 156 
 157   $CmpdCount = 0;
 158 
 159   $FileExt = $SDFilesInfo{FileExt}[$FileIndex];
 160 
 161   $OutFileRoot = $SDFilesInfo{OutFileRoot}[$FileIndex];
 162   $OverwriteFiles = $OptionsInfo{OverwriteFiles};
 163 
 164   $UseDataField = ($OptionsInfo{CmpdsMode} =~ /^DataField$/i) ? 1 : 0;
 165   $DataFieldName = $OptionsInfo{DataField};
 166 
 167   $UseMolName = ($OptionsInfo{CmpdsMode} =~ /^MolName$/i) ? 1 : 0;
 168 
 169   CMPDSTRING: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 170     $CmpdCount++;
 171 
 172     # Setup SD file name...
 173     $NewSDFileRoot = '';
 174     if ($UseDataField) {
 175       @CmpdLines = split "\n", $CmpdString;
 176       %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 177       if (exists $DataFieldValues{$DataFieldName}) {
 178         $NewSDFileRoot = $DataFieldValues{$DataFieldName};
 179       }
 180     }
 181     elsif ($UseMolName) {
 182       @CmpdLines = split "\n", $CmpdString;
 183       $NewSDFileRoot = $CmpdLines[0];
 184     }
 185 
 186     # Check for any invalid file name characters in data field or molname values...
 187     if ($NewSDFileRoot && $NewSDFileRoot =~ /[^a-zA-Z0-9_]/) {
 188       $NewSDFileRoot =~ s/[^a-zA-Z0-9_]//g;
 189     }
 190 
 191     # Fall back plan for SD file name...
 192     if (!$NewSDFileRoot) {
 193       $NewSDFileRoot = "${OutFileRoot}Cmpd${CmpdCount}";
 194     }
 195 
 196     $NewSDFile = "${NewSDFileRoot}.${FileExt}";
 197 
 198     if (!$OverwriteFiles) {
 199       if (-e $NewSDFile) {
 200         warn "Warning: Ignoring compound number, $CmpdCount, in $SDFile: New SD file, $NewSDFile, already exists\n";
 201         next CMPDSTRING;
 202       }
 203     }
 204 
 205     # Write out new SD file...
 206 
 207     print "Generating $NewSDFile file\n";
 208     open NEWSDFILE, ">$NewSDFile" or die "Error: Can't open $NewSDFile: $! \n";
 209     print NEWSDFILE "$CmpdString\n";
 210     close NEWSDFILE;
 211 
 212   }
 213   close SDFILE;
 214 }
 215 
 216 # Split SD into files containing specified number of compounds per file...
 217 #
 218 sub SplitSDFileByNumOfCmpdsPerFile {
 219   my($FileIndex) = @_;
 220   my($SDFile, $CmpdCount, $MaxCmpdsPerFile, $MaxNumOfFiles);
 221 
 222   $SDFile = $SDFilesList[$FileIndex];
 223 
 224   if (!open SDFILE, "$SDFile") {
 225     warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
 226     return;
 227   }
 228 
 229   $MaxCmpdsPerFile = $OptionsInfo{NumOfCmpds};
 230 
 231   # Count number of compounds to figure out maximum number of files...
 232   $CmpdCount = 0;
 233   while (<SDFILE>) {
 234     if (/^\$\$\$\$/) {
 235       $CmpdCount++;
 236     }
 237   }
 238   close SDFILE;
 239 
 240   $MaxNumOfFiles = int $CmpdCount / $MaxCmpdsPerFile;
 241 
 242   if (($MaxNumOfFiles * $MaxCmpdsPerFile) < $CmpdCount) {
 243     $MaxNumOfFiles++;
 244   }
 245 
 246   if ($CmpdCount <= $MaxCmpdsPerFile) {
 247     warn "Warning: Ignoring file $SDFile: Total number of compounds, $CmpdCount, is <= specified number of compunds per file, $MaxCmpdsPerFile\n";
 248     return;
 249   }
 250 
 251   SplitSDFileByNumOfFilesAndCmpds($FileIndex, $MaxNumOfFiles, $MaxCmpdsPerFile);
 252 }
 253 
 254 # Split SD files into specified number of files with specified number of compounds
 255 # in each file...
 256 #
 257 sub SplitSDFileByNumOfFilesAndCmpds {
 258   my($FileIndex, $NumOfFiles, $NumOfCmpdsPerFile) = @_;
 259   my($SDFile, $CmpdCount, $NewFileIndex, $NewFileName, $MaxCmpdsCount, @NewSDFilesList);
 260 
 261   $SDFile = $SDFilesList[$FileIndex];
 262 
 263   if (!open SDFILE, "$SDFile") {
 264     warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
 265     return;
 266   }
 267 
 268   # Setup new file names list...
 269   @NewSDFilesList = ();
 270   for $NewFileIndex (1 .. $NumOfFiles) {
 271     $NewFileName = $SDFilesInfo{OutFileRoot}[$FileIndex] . "Part${NewFileIndex}." . $SDFilesInfo{FileExt}[$FileIndex];
 272     if (!$OptionsInfo{OverwriteFiles}) {
 273       if (-e $NewFileName) {
 274         warn "Warning: Ignoring file $SDFile: New SD file, $NewFileName, already exists\n";
 275         return;
 276       }
 277     }
 278     push @NewSDFilesList, $NewFileName;
 279   }
 280 
 281   $MaxCmpdsCount = $NumOfCmpdsPerFile;
 282 
 283   $CmpdCount = 0;
 284   $NewFileIndex = 1;
 285 
 286   open NEWSDFILE, ">$NewSDFilesList[$NewFileIndex - 1]" or die "Error: Can't open $NewSDFilesList[$NewFileIndex -1]: $! \n";
 287   print "\nGenerating $NewSDFilesList[$NewFileIndex - 1] file\n";
 288 
 289   open SDFILE, "$SDFile" or die "Error: Can't open $SDFile: $! \n";
 290 
 291   while (<SDFILE>) {
 292     s/(\r\n)|(\r)/\n/g;
 293     print NEWSDFILE;
 294 
 295     if ( /^\$\$\$\$/ ) {
 296       $CmpdCount++;
 297       if ($NewFileIndex <= $NumOfFiles) {
 298         if ($CmpdCount >= $MaxCmpdsCount) {
 299           if ($NewFileIndex < $NumOfFiles) {
 300             close NEWSDFILE;
 301           }
 302           $NewFileIndex++;
 303           $MaxCmpdsCount = $NumOfCmpdsPerFile * $NewFileIndex;
 304 
 305           if ($NewFileIndex <= $NumOfFiles) {
 306             open NEWSDFILE, ">$NewSDFilesList[$NewFileIndex - 1]" or die "Error: Can't open $NewSDFilesList[$NewFileIndex - 1]: $! \n";
 307             print "Generating $NewSDFilesList[$NewFileIndex - 1] file...\n";
 308           }
 309         }
 310       }
 311     }
 312   }
 313   close NEWSDFILE;
 314 }
 315 
 316 # Retrieve information about SD files...
 317 #
 318 sub RetrieveSDFilesInfo {
 319   my($SDFile, $Index, $FileDir, $FileName, $FileExt, $OutFileRoot);
 320 
 321   %SDFilesInfo = ();
 322   @{$SDFilesInfo{FileOkay}} = ();
 323   @{$SDFilesInfo{FileExt}} = ();
 324   @{$SDFilesInfo{OutFileRoot}} = ();
 325 
 326   FILELIST: for $Index (0 .. $#SDFilesList) {
 327     $SDFile = $SDFilesList[$Index];
 328 
 329     $SDFilesInfo{FileOkay}[$Index] = 0;
 330     $SDFilesInfo{FileExt}[$Index] = '';
 331     $SDFilesInfo{OutFileRoot}[$Index] = '';
 332 
 333     $SDFile = $SDFilesList[$Index];
 334     if (!(-e $SDFile)) {
 335       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 336       next FILELIST;
 337     }
 338     if (!CheckFileType($SDFile, "sd sdf")) {
 339       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 340       next FILELIST;
 341     }
 342 
 343     # Setup output file root...
 344     $FileDir = ""; $FileName = ""; $FileExt = "";
 345     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 346 
 347     if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
 348       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 349       if ($RootFileName && $RootFileExt) {
 350         $FileName = $RootFileName;
 351       }
 352       else {
 353         $FileName = $OptionsInfo{OutFileRoot};
 354       }
 355       $OutFileRoot = $FileName;
 356     }
 357     else {
 358       $OutFileRoot = "$FileName";
 359     }
 360 
 361     $SDFilesInfo{FileOkay}[$Index] = 1;
 362     $SDFilesInfo{FileExt}[$Index] = $FileExt;
 363     $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
 364   }
 365 }
 366 
 367 # Process option values...
 368 sub ProcessOptions {
 369   %OptionsInfo = ();
 370 
 371   $OptionsInfo{Mode} = $Options{mode};
 372 
 373   $OptionsInfo{CmpdsMode} = $Options{cmpdsmode};
 374 
 375   $OptionsInfo{NumOfFiles} = $Options{numfiles};
 376   $OptionsInfo{NumOfCmpds} = $Options{numcmpds};
 377 
 378   $OptionsInfo{DataField} = '';
 379   if ($Options{mode} =~ /^Cmpds$/i && $Options{cmpdsmode} =~ /^DataField$/i) {
 380     if (!$Options{datafield}) {
 381       die "Error: You must specify a value for \"-d, --DataField\" option in \"DataField\" value of \"-c, --CmpdsMode\" during \"Cmpds\" \"-m, --mode\" value. \n";
 382     }
 383     $OptionsInfo{DataField} = $Options{datafield};
 384   }
 385 
 386   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
 387 
 388   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
 389 }
 390 
 391 
 392 # Setup script usage  and retrieve command line arguments specified using various options...
 393 sub SetupScriptUsage {
 394 
 395   # Retrieve all the options...
 396   %Options = ();
 397 
 398   $Options{cmpdsmode} = 'RootPrefix';
 399   $Options{mode} = 'Files';
 400 
 401   $Options{numfiles} = 2;
 402   $Options{numcmpds} = 1;
 403 
 404 
 405   if (!GetOptions(\%Options, "cmpdsmode|c=s", "datafield|d=s", "help|h", "mode|m=s", "numfiles|n=i", "numcmpds=i", "overwrite|o", "root|r=s", "workingdir|w=s")) {
 406     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 407   }
 408   if ($Options{workingdir}) {
 409     if (! -d $Options{workingdir}) {
 410       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 411     }
 412     chdir $Options{workingdir} or die "Error: Error: Couldn't chdir $Options{workingdir}: $! \n";
 413   }
 414   if ($Options{cmpdsmode} !~ /^(DataField|MolName|RootPrefix)$/i) {
 415     die "Error: The value specified, $Options{cmpdsmode}, for option \"-c, --CmpdsMode\" is not valid. Allowed values: DataField, MolName, RootPrefix\n";
 416   }
 417   if ($Options{mode} !~ /^(Cmpds|Files)$/i) {
 418     die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: Cmpds, Files\n";
 419   }
 420   if ($Options{numfiles} < 2) {
 421     die "Error: The value specified, $Options{numfiles}, for option \"-n --numfiles\" is not valid. Allowed values: >= 2 \n";
 422   }
 423   if ($Options{numcmpds} < 1) {
 424     die "Error: The value specified, $Options{numcmpds}, for option \"-n --numcmpds\" is not valid. Allowed values: >= 1 \n";
 425   }
 426 }
 427