#!/usr/bin/perl -w
#
# File: MolFilesToSD.pl
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2024 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability or fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use SDFileUtil;
use FileUtil;
use TextUtil;

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@MOLFilesList);
@MOLFilesList = ExpandFileNames(\@ARGV, "mol");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

print "Generating SD file $OptionsInfo{SDFile}...\n";
GenerateSDFile();

print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Generate a SD file using all valid MDL MOL files...
#
# Reads the file names in @MOLFilesList and the settings in %OptionsInfo
# (SDFile, ModifyData, AddMolNameLine, AddDataField, UseFilePrefix,
# CompoundID, DataFieldLabel). Each MOL file that exists, passes
# CheckFileType and can be opened is appended to the SD file with its line
# endings converted to "\n" and followed by a "$$$$" record delimiter; other
# files are skipped with a warning.
#
# Depending on the mode, the first line of each MOL file is replaced by the
# compound ID and/or the compound ID is appended as a data field.
#
# Dies when the SD file can't be opened or closed. Takes no arguments and
# returns nothing useful.
#
sub GenerateSDFile {
  my($FileCount, $FileOkayCount, $MolNameLine, $RemainingData, $Line, $LastLine, $CmpdID, $FileDir, $FileName, $FileExt, $SDFH);

  # Three-argument open: the file name is never interpreted as a mode or a command...
  open($SDFH, ">", $OptionsInfo{SDFile}) or die "Error: Can't open $OptionsInfo{SDFile}: $! \n";
  $FileCount = 0;
  $FileOkayCount = 0;

  FILELIST: for my $MOLFile (@MOLFilesList) {
    my($MOLFH);

    $FileCount++;

    print "Processing file $MOLFile...\n";

    if (!(-e $MOLFile)) {
      warn "Warning: Ignoring file $MOLFile: It doesn't exist\n";
      next FILELIST;
    }

    if (!CheckFileType($MOLFile, "mol")) {
      warn "Warning: Ignoring file $MOLFile: It's not a MDLMOL file\n";
      next FILELIST;
    }

    if (!open($MOLFH, "<", $MOLFile)) {
      warn "Warning: Ignoring file $MOLFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    $FileOkayCount++;

    # Track the last chunk written for this record in order to know whether it
    # ended with a newline...
    $CmpdID = "";
    $LastLine = "";

    if ($OptionsInfo{ModifyData}) {
      # The first line is the molecule name line; an empty file yields undef...
      $MolNameLine = <$MOLFH>;
      $MolNameLine = "" if (!defined $MolNameLine);
      $MolNameLine =~ s/\r\n?/\n/g;

      # For files using only "\r" as line terminator, the first read returns
      # multiple lines; keep just the first one as the name line and write out
      # the rest unchanged...
      ($MolNameLine, $RemainingData) = ($MolNameLine =~ /\A([^\n]*\n?)(.*)\z/s);

      if ($OptionsInfo{UseFilePrefix}) {
        ($FileDir, $FileName, $FileExt) = ParseFileName($MOLFile);
        $CmpdID = $FileName;
      }
      else {
        $CmpdID = $OptionsInfo{CompoundID} . "$FileCount";
      }

      $LastLine = $OptionsInfo{AddMolNameLine} ? "$CmpdID\n" : $MolNameLine;
      print $SDFH $LastLine;

      if (length($RemainingData)) {
        print $SDFH $RemainingData;
        $LastLine = $RemainingData;
      }
    }

    # Copy rest of the file after normalizing line endings...
    while ($Line = <$MOLFH>) {
      $Line =~ s/\r\n?/\n/g;
      print $SDFH $Line;
      $LastLine = $Line;
    }
    close $MOLFH;

    if ($OptionsInfo{AddDataField}) {
      # Data field header must start on its own line even when MOL file
      # doesn't end with a newline...
      if (length($LastLine) && $LastLine !~ /\n\z/) {
        print $SDFH "\n";
      }
      print $SDFH "> <$OptionsInfo{DataFieldLabel}>\n${CmpdID}\n";
    }
    print $SDFH "\n\$\$\$\$\n";
  }

  # Buffered write errors are reported during close...
  close($SDFH) or die "Error: Couldn't close $OptionsInfo{SDFile}: $! \n";

  print "\nNumber of files: $FileCount\n";
  print "Number of files processed successfully: $FileOkayCount\n";
  print "Number of files ignored: " . ($FileCount - $FileOkayCount) . "\n";
}

# Process option values...
#
# Copies the command line settings from %Options into %OptionsInfo, derives
# the mode flags (AddMolNameLine, AddDataField, ModifyData, UseFilePrefix) and
# sets up the SD file name in $OptionsInfo{SDFile}.
#
# SD file name is <Root>.sdf for "-r --root" option; otherwise, it's
# <FirstMOLFileName>1To<Count>.sdf. Dies when the SD file already exists and
# "-o --overwrite" option is not specified.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{CompoundID} = $Options{compoundid};
  $OptionsInfo{DataFieldLabel} = $Options{datafieldlabel};

  # Stay undef when the options are not specified...
  $OptionsInfo{Overwrite} = $Options{overwrite};
  $OptionsInfo{OutFileRoot} = $Options{root};

  $OptionsInfo{AddMolNameLine} = ($Options{mode} =~ /^(molnameline|both)$/i) ? 1 : 0;
  $OptionsInfo{AddDataField} = ($Options{mode} =~ /^(datafield|both)$/i) ? 1 : 0;

  $OptionsInfo{ModifyData} = ($OptionsInfo{AddMolNameLine} || $OptionsInfo{AddDataField}) ? 1 : 0;

  $OptionsInfo{UseFilePrefix} = ($Options{compoundid} =~ /^usefileprefix$/i) ? 1 : 0;

  # Setup SD file name...
  my($FileDir, $FileName, $FileExt, $SDFile);
  if ($Options{root}) {
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($Options{root});
    if ($FileName && $FileExt) {
      # Root was specified with an extension: use only its name part...
      $SDFile = $FileName;
    }
    else {
      $SDFile = $Options{root};
    }
    $SDFile .= ".sdf";
  }
  else {
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($MOLFilesList[0]);
    $SDFile = $FileName . "1To" . scalar(@MOLFilesList) . ".sdf";
  }

  if (!$Options{overwrite}) {
    if (-e $SDFile) {
      die "Error: The file $SDFile already exists.\n";
    }
  }
  $OptionsInfo{SDFile} = $SDFile;

}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Initializes %Options with default values for compoundid, datafieldlabel and
# mode before parsing the command line using GetOptions. Changes to the
# directory specified using "-w --workingdir" option. Dies for unknown
# options, a working directory which is not a directory or can't be entered,
# and an invalid mode value.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();
  $Options{compoundid} = "Cmpd";
  $Options{datafieldlabel} = "Cmpd_ID";
  $Options{mode} = "none";

  if (!GetOptions(\%Options, "compoundid|c=s", "datafieldlabel|d=s", "help|h", "mode|m=s", "overwrite|o", "root|r=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{mode} !~ /^(molnameline|datafield|both|none)$/i ) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: molnameline, datafield, both, or none\n";
  }
}
