#!/usr/bin/perl -w
#
# File: SDToMolFiles.pl
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2024 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability or fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use SDFileUtil;
use FileUtil;

# File-scoped state shared with the subroutines defined later in this file:
#   %Options   - raw command line options filled in by SetupScriptUsage()
#   $StartTime - Benchmark object taken at startup; used for the final timing report
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
# Direct method call instead of the ambiguous indirect object form "new Benchmark"...
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  # No input files or explicit help request: print usage and stop...
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Remaining command line arguments are the input files; ExpandFileNames is
# given the "sdf sd" extensions list (presumably to expand wild cards - see FileUtil).
my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Process input files..
my($FileIndex);
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for $FileIndex (0 .. $#SDFilesList) {
  # Skip files that RetrieveSDFilesInfo() flagged as unusable...
  next unless $SDFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  GenerateMolFiles($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Generate MOL files for a SD file...
#
# Arguments:
#   $FileIndex - index into @SDFilesList and the parallel arrays in %SDFilesInfo.
#
# One MOL file is written per compound, named <root>.mol without any directory
# part. The root is picked according to $OptionsInfo{Mode}:
#   DataField  - value of the data field named by $OptionsInfo{DataField}
#   MolName    - first line of the compound record
#   otherwise  - <OutFileRoot>Cmpd<compound number>
# The last form is also the fall back whenever the first two yield an empty name.
#
# Compounds are skipped with a warning when the MOL file already exists and
# overwriting is off, or when the record has no "M  END" line. An SD file
# which can't be opened is skipped with a warning; failing to create or
# close a MOL file is fatal.
#
sub GenerateMolFiles {
  my($FileIndex) = @_;
  my($SDFile, $SDFileHandle, $MOLFile, $MOLFileHandle, $MOLFileRoot, $OutFileRoot, $OverwriteFiles, $UseDataField, $DataFieldName, $UseMolName, $CmpdCount, $MolEndDelimiter, $MolEndPos, $CmpdString, @CmpdLines, %DataFieldValues);

  $SDFile = $SDFilesList[$FileIndex];

  # Three-argument open: the file name is never interpreted as an open mode...
  if (!open($SDFileHandle, '<', $SDFile)) {
    warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
    return;
  }

  $CmpdCount = 0;

  # The connection table in MOL/SD files is terminated by "M  END": M, two
  # spaces, END. A single space never matches data written by conforming tools.
  $MolEndDelimiter = "M  END";

  $OutFileRoot = $SDFilesInfo{OutFileRoot}[$FileIndex];
  $OverwriteFiles = $OptionsInfo{OverwriteFiles};

  $UseDataField = ($OptionsInfo{Mode} =~ /^DataField$/i) ? 1 : 0;
  $DataFieldName = $OptionsInfo{DataField};

  $UseMolName = ($OptionsInfo{Mode} =~ /^MolName$/i) ? 1 : 0;

  # NOTE(review): ReadCmpdString comes from SDFileUtil and is assumed to accept
  # any file handle reference, lexical or \*GLOB - confirm against SDFileUtil.
  CMPDSTRING: while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    $CmpdCount++;

    # Setup MOL file name...
    $MOLFileRoot = '';
    if ($UseDataField) {
      @CmpdLines = split "\n", $CmpdString;
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      if (exists $DataFieldValues{$DataFieldName}) {
        $MOLFileRoot = $DataFieldValues{$DataFieldName};
      }
    }
    elsif ($UseMolName) {
      @CmpdLines = split "\n", $CmpdString;
      $MOLFileRoot = $CmpdLines[0];
    }

    # Drop any invalid file name characters in data field or molname values...
    if (defined $MOLFileRoot) {
      $MOLFileRoot =~ s/[^a-zA-Z0-9_]//g;
    }

    # Fall back plan for MOL file name. Test the length rather than the truth
    # value so that a legitimate name such as "0" is not discarded...
    if (!defined $MOLFileRoot || !length $MOLFileRoot) {
      $MOLFileRoot = "${OutFileRoot}Cmpd${CmpdCount}";
    }

    $MOLFile = "${MOLFileRoot}.mol";

    if (!$OverwriteFiles && -e $MOLFile) {
      warn "Warning: Ignoring compound number, $CmpdCount, in $SDFile: New MOL file, $MOLFile, already exists\n";
      next CMPDSTRING;
    }

    # Literal search: the delimiter is data, not a regular expression...
    $MolEndPos = index($CmpdString, $MolEndDelimiter);
    if ($MolEndPos < 0) {
      warn "Warning: Ignoring compound number, $CmpdCount, in $SDFile: Invalid compound data\n";
      next CMPDSTRING;
    }

    # Write out MOL file: everything before the first delimiter followed by
    # the delimiter line; SD data fields after it are dropped...
    print "Generating $MOLFile file...\n";
    open($MOLFileHandle, '>', $MOLFile) or die "Error: Can't open $MOLFile: $! \n";
    print {$MOLFileHandle} substr($CmpdString, 0, $MolEndPos), "$MolEndDelimiter\n";

    # Buffered write errors only surface at close time...
    close($MOLFileHandle) or die "Error: Can't close $MOLFile: $! \n";
  }

  close $SDFileHandle;
}

# Retrieve information about SD files...
#
# Fills the file-scoped %SDFilesInfo with two arrays parallel to @SDFilesList:
#   FileOkay    - 1 when the file exists and passes CheckFileType(), else 0
#   OutFileRoot - root used for default MOL file names; '' for rejected files
#
# The value of $OptionsInfo{OutFileRoot} is honored only when exactly one
# input file was specified; otherwise the input file's own name is used.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileDir, $FileName, $FileExt, $OutFileRoot);

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFileRoot}} = ();

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    # Assume the worst until all the checks pass...
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFileRoot}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Setup output file root...
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
    $OutFileRoot = $FileName;

    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      # Use only the name part when the specified root parses into a name
      # and an extension; otherwise take the specified value as is...
      my(undef, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      $OutFileRoot = ($RootFileName && $RootFileExt) ? $RootFileName : $OptionsInfo{OutFileRoot};
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
  }
}

# Process option values...
#
# Copies the raw values in %Options into the file-scoped %OptionsInfo:
#   Mode           - value of "-m, --mode"
#   DataField      - value of "-d, --DataField"; '' unless mode is DataField
#   OverwriteFiles - 1 when "-o, --overwrite" was specified, else 0
#   OutFileRoot    - value of "-r, --root", or 0 when not specified
#
# Dies when mode is DataField and no data field name was specified.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{DataField} = '';
  if ($Options{mode} =~ /^DataField$/i) {
    if (!$Options{datafield}) {
      die "Error: You must specify a value for \"-d, --DataField\" option in \"DataField\" \"-m, --mode\". \n";
    }
    $OptionsInfo{DataField} = $Options{datafield};
  }

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;

  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Resets the file-scoped %Options, defaults mode to RootPrefix and lets
# GetOptions() fill in the rest. Changes the current directory when
# "-w, --workingdir" is specified. Dies on unknown options, on a working
# directory which is not a directory or can't be entered, and on an invalid
# mode value.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  $Options{mode} = 'RootPrefix';

  if (!GetOptions(\%Options, "datafield|d=s", "help|h", "mode|m=s", "overwrite|o", "root|r=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  if ($Options{mode} !~ /^(DataField|MolName|RootPrefix)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: DataField, MolName, RootPrefix\n";
  }
}