MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # File: MolFilesToSD.pl
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
  16 # any warranty; without even the implied warranty of merchantability or fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use FindBin; use lib "$FindBin::Bin/../lib";
  28 use Getopt::Long;
  29 use File::Basename;
  30 use Text::ParseWords;
  31 use Benchmark;
  32 use SDFileUtil;
  33 use FileUtil;
  34 use TextUtil;
  35 
  # File-scoped state. %Options and $ScriptName are read and written by the
  # subroutines defined further down in this file, so they must stay declared here.
  my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

  # Autoflush STDOUT
  $| = 1;

  # Starting message...
  $ScriptName = basename($0);
  print "\n$ScriptName:Starting...\n\n";

  # Direct method call instead of indirect object syntax ("new Benchmark"), which
  # is ambiguous to the parser...
  $StartTime = Benchmark->new;

  # Get the options and setup script...
  SetupScriptUsage();
  if ($Options{help} || @ARGV < 1) {
    # Usage text is emitted through die: it goes to STDERR and the script
    # terminates without processing any files.
    die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  }

  # List of MOL files to process; shared with ProcessOptions and GenerateSDFile...
  my(@MOLFilesList);
  @MOLFilesList = ExpandFileNames(\@ARGV, "mol");

  # Process options...
  print "Processing options...\n";
  my(%OptionsInfo);
  ProcessOptions();

  print "Generating SD file $OptionsInfo{SDFile}...\n";
  GenerateSDFile();

  print "\n$ScriptName:Done...\n\n";

  # Report the elapsed time...
  $EndTime = Benchmark->new;
  $TotalTime = timediff($EndTime, $StartTime);
  print "Total time: ", timestr($TotalTime), "\n";
  68 
  69 ###############################################################################
  70 
  # Generate a SD file using all valid MDL MOL files...
  #
  # Takes no arguments; reads the file-scoped @MOLFilesList and %OptionsInfo.
  #
  # Each MOL file which exists, passes CheckFileType() and can be opened is written
  # to $OptionsInfo{SDFile} as one record followed by a "$$$$" delimiter line. Line
  # endings (CRLF or CR) are converted to LF. Files failing any of the checks are
  # skipped with a warning and counted as ignored.
  #
  # Record modifications controlled by %OptionsInfo:
  #
  #   AddMolNameLine - the first line of the MOL data is replaced by the compound ID
  #   AddDataField   - a ">  <DataFieldLabel>" data item containing the compound ID
  #                    is appended after the MOL data
  #   UseFilePrefix  - the compound ID is the name part returned by ParseFileName();
  #                    otherwise it is CompoundID followed by the position of the
  #                    file in the list (ignored files are counted as well)
  #
  # Dies when the SD file can't be opened or closed.
  #
  sub GenerateSDFile {
    my($MOLFile, $FileCount, $FileOkayCount, $MolData, $CmpdID, $FileDir, $FileName, $FileExt, $SDFileFH);

    # Three-argument open: characters such as ">" or "|" or surrounding spaces in a
    # file name are never interpreted as part of the open mode...
    open($SDFileFH, ">", $OptionsInfo{SDFile}) or die "Error: Can't open $OptionsInfo{SDFile}: $! \n";
    $FileCount = 0;
    $FileOkayCount = 0;

    FILELIST: for $MOLFile (@MOLFilesList) {
      $FileCount++;

      print "Processing file $MOLFile...\n";

      if (!(-e $MOLFile)) {
        warn "Warning: Ignoring file $MOLFile: It doesn't exist\n";
        next FILELIST;
      }

      if (!CheckFileType($MOLFile, "mol")) {
        warn "Warning: Ignoring file $MOLFile: It's not a MDLMOL file\n";
        next FILELIST;
      }

      my($MOLFileFH);
      if (!open($MOLFileFH, "<", $MOLFile)) {
        warn "Warning: Ignoring file $MOLFile: Couldn't open it: $! \n";
        next FILELIST;
      }

      $FileOkayCount++;

      # Read the whole file before touching the first line. With CR-only line
      # endings a line-wise read returns the complete file as its "first line",
      # and replacing that line would throw away the entire MOL data.
      {
        local $/ = undef;
        $MolData = <$MOLFileFH>;
      }
      close $MOLFileFH;
      $MolData = "" if !defined $MolData;

      # Normalize line endings...
      $MolData =~ s/\r\n|\r/\n/g;

      if ($OptionsInfo{ModifyData}) {
        if ($OptionsInfo{UseFilePrefix}) {
          ($FileDir, $FileName, $FileExt) = ParseFileName($MOLFile);
          $CmpdID = $FileName;
        }
        else {
          $CmpdID = $OptionsInfo{CompoundID} . "$FileCount";
        }

        if ($OptionsInfo{AddMolNameLine}) {
          # Replace only the first line by the compound ID...
          $MolData =~ s/^[^\n]*\n?/$CmpdID\n/;
        }

        if ($OptionsInfo{AddDataField}) {
          # Make sure the data item header starts on its own line even when the
          # MOL file doesn't end with a line terminator...
          if (length($MolData) && $MolData !~ /\n\z/) {
            $MolData .= "\n";
          }
          $MolData .= ">  <$OptionsInfo{DataFieldLabel}>\n${CmpdID}\n";
        }
      }

      print $SDFileFH $MolData, "\n\$\$\$\$\n";
    }

    # Buffered write errors show up during close...
    close($SDFileFH) or die "Error: Can't close $OptionsInfo{SDFile}: $! \n";

    print "\nNumber of files: $FileCount\n";
    print "Number of files processed successfully: $FileOkayCount\n";
    print "Number of files ignored: " . ($FileCount - $FileOkayCount) . "\n";
  }
 144 
 # Process option values...
 #
 # Takes no arguments. Clears and populates the file-scoped %OptionsInfo using the
 # command line values in %Options and the file-scoped @MOLFilesList:
 #
 #   Mode, CompoundID, DataFieldLabel, Overwrite, OutFileRoot - copied from %Options
 #   AddMolNameLine - 1 for mode molnameline or both; otherwise 0
 #   AddDataField   - 1 for mode datafield or both; otherwise 0
 #   ModifyData     - 1 when either of the two flags above is set
 #   UseFilePrefix  - 1 when compoundid is "usefileprefix" (case insensitive)
 #   SDFile         - name of the SD file to generate
 #
 # Dies when the SD file already exists and overwrite wasn't specified, or when
 # no root was specified and there is no MOL file to derive a name from.
 #
 sub ProcessOptions {
   %OptionsInfo = ();

   $OptionsInfo{Mode} = $Options{mode};

   $OptionsInfo{CompoundID} = $Options{compoundid};
   $OptionsInfo{DataFieldLabel} = $Options{datafieldlabel};

   # Both values stay undefined when the corresponding option wasn't specified...
   $OptionsInfo{Overwrite} = $Options{overwrite};
   $OptionsInfo{OutFileRoot} = $Options{root};

   $OptionsInfo{AddMolNameLine} = ($Options{mode} =~ /^(molnameline|both)$/i) ? 1 : 0;
   $OptionsInfo{AddDataField} = ($Options{mode} =~ /^(datafield|both)$/i) ? 1 : 0;

   $OptionsInfo{ModifyData} = ($OptionsInfo{AddMolNameLine} || $OptionsInfo{AddDataField}) ? 1 : 0;

   $OptionsInfo{UseFilePrefix} = ($Options{compoundid} =~ /^usefileprefix$/i) ? 1 : 0;

   # Setup SD file name...
   my($FileDir, $FileName, $FileExt, $SDFile);
   if ($Options{root}) {
     ($FileDir, $FileName, $FileExt) = ParseFileName($Options{root});

     # A root specified with an extension contributes only its name part.
     # NOTE(review): the directory part returned by ParseFileName is not used
     # here - confirm that is intended.
     $SDFile = ($FileName && $FileExt) ? $FileName : $Options{root};
     $SDFile .= ".sdf";
   }
   else {
     # The default name is derived from the first MOL file; without any file
     # there is nothing to derive it from...
     if (!@MOLFilesList) {
       die "Error: No MOL files available to set up a default SD file name: Specify it using \"-r --root\" option.\n";
     }
     ($FileDir, $FileName, $FileExt) = ParseFileName($MOLFilesList[0]);
     $SDFile = $FileName . "1To" . scalar(@MOLFilesList) . ".sdf";
   }

   if (!$Options{overwrite} && -e $SDFile) {
     die "Error: The file $SDFile already exists.\n";
   }
   $OptionsInfo{SDFile} = $SDFile;

 }
 194 
 # Setup script usage and retrieve command line arguments specified using various options...
 #
 # Takes no arguments. Resets the file-scoped %Options to its default values,
 # lets GetOptions() overlay the values given on the command line, changes into
 # the working directory when one is specified and validates the mode value.
 #
 # Dies on an unparsable command line, a working directory which is not a
 # directory or can't be entered, or an unsupported mode value.
 #
 sub SetupScriptUsage {
   my(@OptionSpecs);

   # Default values for the options...
   %Options = (
     compoundid => "Cmpd",
     datafieldlabel => "Cmpd_ID",
     mode => "none",
   );

   # Retrieve all the options...
   @OptionSpecs = ("compoundid|c=s", "datafieldlabel|d=s", "help|h", "mode|m=s", "overwrite|o", "root|r=s", "workingdir|w=s");
   GetOptions(\%Options, @OptionSpecs)
     or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

   # Switch to the working directory, if any...
   if ($Options{workingdir}) {
     unless (-d $Options{workingdir}) {
       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
     }
     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
   }

   # Only the four documented mode values are accepted...
   unless ($Options{mode} =~ /^(molnameline|datafield|both|none)$/i) {
     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: molnameline, datafield, both, or none\n";
   }
 }
 217