#!/usr/bin/perl -w
#
# File: SplitSDFiles.pl
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2025 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Benchmark;
use SDFileUtil;
use FileUtil;

# File-level state shared with the subroutines below:
#   %Options   - raw command line option values filled in by SetupScriptUsage()
#   $StartTime, $EndTime, $TotalTime - Benchmark objects used to report run time
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename $0;
print "\n$ScriptName:Starting...\n\n";
# Direct method call instead of indirect object syntax ("new Benchmark")...
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  # Usage text goes out through die whenever help is requested or no input
  # file names were supplied...
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Input file names expanded from command line arguments; indexed in parallel with
# the arrays kept in %SDFilesInfo...
my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
my(%SDFilesInfo);
print "Checking input SD file(s)...\n";
RetrieveSDFilesInfo();

# Process input files..
my($FileIndex);
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
# Only files flagged as okay by RetrieveSDFilesInfo() are split...
for $FileIndex (0 .. $#SDFilesList) {
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    SplitSDFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

# Direct method call instead of indirect object syntax ("new Benchmark")...
$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Split a SD file...
#
# Dispatches on $OptionsInfo{Mode} (matched case-insensitively): "Files" splits
# into a fixed number of files and "Cmpds" splits by number of compounds per
# file. Any other mode value does nothing.
#
# Arguments:
#   $FileIndex - index of the input file in @SDFilesList
#
sub SplitSDFile {
  my($FileIndex) = @_;

  if ($OptionsInfo{Mode} =~ /^Files$/i) {
    SplitSDFileByNumOfFiles($FileIndex);
  }
  elsif ($OptionsInfo{Mode} =~ /^Cmpds$/i) {
    SplitSDFileByNumOfCmpds($FileIndex);
  }

  return;
}

# Split SD into specified number of files...
#
# Counts compounds in the input file by counting lines starting with "$$$$",
# derives the number of compounds per file as int(count / number of files), and
# hands over to SplitSDFileByNumOfFilesAndCmpds(). The file is skipped with a
# warning when it can't be opened or contains fewer compounds than the number
# of files requested through $OptionsInfo{NumOfFiles}.
#
# Arguments:
#   $FileIndex - index of the input file in @SDFilesList
#
sub SplitSDFileByNumOfFiles {
  my($FileIndex) = @_;
  my($SDFile, $SDFileHandle, $CmpdCount, $MaxCmpdsPerFile, $MaxNumOfFiles);

  $SDFile = $SDFilesList[$FileIndex];

  # Three-arg open with a lexical handle: the file name is taken literally, so
  # mode characters or whitespace in a user supplied name can't change how the
  # file is opened...
  if (!open $SDFileHandle, '<', $SDFile) {
    warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
    return;
  }

  $MaxNumOfFiles = $OptionsInfo{NumOfFiles};

  # Count number of compounds to figure out maximum number of compound per file...
  $CmpdCount = 0;
  while (my $Line = <$SDFileHandle>) {
    if ($Line =~ /^\$\$\$\$/) {
      $CmpdCount++;
    }
  }
  close $SDFileHandle;

  if ($CmpdCount < $MaxNumOfFiles) {
    warn "Warning: Ignoring file $SDFile: Total number of compounds, $CmpdCount, is smaller than number of new files, $MaxNumOfFiles\n";
    return;
  }

  # Explicit parentheses: truncate the quotient, not the count...
  $MaxCmpdsPerFile = int($CmpdCount / $MaxNumOfFiles);

  SplitSDFileByNumOfFilesAndCmpds($FileIndex, $MaxNumOfFiles, $MaxCmpdsPerFile);

  return;
}

# Split SD into files containing specified number of compounds...
#
# Dispatches on $OptionsInfo{NumOfCmpds}: a value of 1 writes each compound to
# its own file; any other value writes that many compounds to each new file.
#
# Arguments:
#   $FileIndex - index of the input file in @SDFilesList
#
sub SplitSDFileByNumOfCmpds {
  my($FileIndex) = @_;

  if ($OptionsInfo{NumOfCmpds} == 1) {
    SplitSDFileByOneCmpdPerFile($FileIndex);
  }
  else {
    SplitSDFileByNumOfCmpdsPerFile($FileIndex);
  }

  return;
}

# Split SD into files containing one compound per file...
#
# The name of each new file is <root>.<FileExt>, where <root> comes from:
#   . the value of data field $OptionsInfo{DataField}, when $OptionsInfo{CmpdsMode}
#     is "DataField" and the compound has that field
#   . the first line of the compound record, when $OptionsInfo{CmpdsMode} is
#     "MolName"
#   . "<OutFileRoot>Cmpd<N>" otherwise, or whenever the value above is missing or
#     empty after all characters other than a-z, A-Z, 0-9 and _ are removed
#
# Unless $OptionsInfo{OverwriteFiles} is set, a compound whose new file already
# exists is skipped with a warning. Dies when a new file can't be opened or
# closed.
#
# Arguments:
#   $FileIndex - index of the input file in @SDFilesList
#
sub SplitSDFileByOneCmpdPerFile {
  my($FileIndex) = @_;
  my($SDFile, $SDFileHandle, $NewSDFile, $NewSDFileRoot, $FileExt, $OutFileRoot, $OverwriteFiles, $UseDataField, $DataFieldName, $UseMolName, $CmpdCount, $CmpdString, @CmpdLines, %DataFieldValues);

  $SDFile = $SDFilesList[$FileIndex];

  # Three-arg open with a lexical handle: the file name is taken literally and
  # the handle is released on every exit path...
  if (!open $SDFileHandle, '<', $SDFile) {
    warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
    return;
  }

  print "\n";

  $CmpdCount = 0;

  $FileExt = $SDFilesInfo{FileExt}[$FileIndex];

  $OutFileRoot = $SDFilesInfo{OutFileRoot}[$FileIndex];
  $OverwriteFiles = $OptionsInfo{OverwriteFiles};

  $UseDataField = ($OptionsInfo{CmpdsMode} =~ /^DataField$/i) ? 1 : 0;
  $DataFieldName = $OptionsInfo{DataField};

  $UseMolName = ($OptionsInfo{CmpdsMode} =~ /^MolName$/i) ? 1 : 0;

  # A lexical file handle is a glob reference just like the \*SDFILE used earlier...
  CMPDSTRING: while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    $CmpdCount++;

    # Setup SD file name...
    $NewSDFileRoot = '';
    if ($UseDataField) {
      @CmpdLines = split "\n", $CmpdString;
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      if (exists $DataFieldValues{$DataFieldName}) {
        $NewSDFileRoot = $DataFieldValues{$DataFieldName};
      }
    }
    elsif ($UseMolName) {
      @CmpdLines = split "\n", $CmpdString;
      $NewSDFileRoot = $CmpdLines[0];
    }

    # A missing value is handled like an empty one instead of triggering
    # warnings during substitution...
    if (!defined $NewSDFileRoot) {
      $NewSDFileRoot = '';
    }

    # Remove any invalid file name characters in data field or molname values...
    $NewSDFileRoot =~ s/[^a-zA-Z0-9_]//g;

    # Fall back plan for SD file name. Test the length rather than truth so that
    # a legitimate value of "0" is still used as the file name...
    if (!length $NewSDFileRoot) {
      $NewSDFileRoot = "${OutFileRoot}Cmpd${CmpdCount}";
    }

    $NewSDFile = "${NewSDFileRoot}.${FileExt}";

    if (!$OverwriteFiles) {
      if (-e $NewSDFile) {
        warn "Warning: Ignoring compound number, $CmpdCount, in $SDFile: New SD file, $NewSDFile, already exists\n";
        next CMPDSTRING;
      }
    }

    # Write out new SD file...

    print "Generating $NewSDFile file\n";
    open my $NewSDFileHandle, '>', $NewSDFile or die "Error: Can't open $NewSDFile: $! \n";
    print $NewSDFileHandle "$CmpdString\n";
    # Buffered write errors surface at close time...
    close $NewSDFileHandle or die "Error: Couldn't close $NewSDFile: $! \n";

  }
  close $SDFileHandle;

  return;
}

# Split SD into files containing specified number of compounds per file...
#
# Counts compounds in the input file by counting lines starting with "$$$$",
# derives the number of new files as the count divided by
# $OptionsInfo{NumOfCmpds} rounded up, and hands over to
# SplitSDFileByNumOfFilesAndCmpds(). The file is skipped with a warning when it
# can't be opened or when all of its compounds would fit into a single new file.
#
# Arguments:
#   $FileIndex - index of the input file in @SDFilesList
#
sub SplitSDFileByNumOfCmpdsPerFile {
  my($FileIndex) = @_;
  my($SDFile, $SDFileHandle, $CmpdCount, $MaxCmpdsPerFile, $MaxNumOfFiles);

  $SDFile = $SDFilesList[$FileIndex];

  # Three-arg open with a lexical handle: the file name is taken literally...
  if (!open $SDFileHandle, '<', $SDFile) {
    warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
    return;
  }

  $MaxCmpdsPerFile = $OptionsInfo{NumOfCmpds};

  # Count number of compounds to figure out maximum number of files...
  $CmpdCount = 0;
  while (my $Line = <$SDFileHandle>) {
    if ($Line =~ /^\$\$\$\$/) {
      $CmpdCount++;
    }
  }
  close $SDFileHandle;

  # Nothing to split; check before doing any file count arithmetic...
  if ($CmpdCount <= $MaxCmpdsPerFile) {
    warn "Warning: Ignoring file $SDFile: Total number of compounds, $CmpdCount, is <= specified number of compunds per file, $MaxCmpdsPerFile\n";
    return;
  }

  $MaxNumOfFiles = int($CmpdCount / $MaxCmpdsPerFile);

  # One more file for the compounds left over after the full files...
  if (($MaxNumOfFiles * $MaxCmpdsPerFile) < $CmpdCount) {
    $MaxNumOfFiles++;
  }

  SplitSDFileByNumOfFilesAndCmpds($FileIndex, $MaxNumOfFiles, $MaxCmpdsPerFile);

  return;
}

# Split SD files into specified number of files with specified number of compounds
# in each file...
#
# New files are named <OutFileRoot>Part<N>.<FileExt> for N = 1 .. $NumOfFiles,
# using the values stored in %SDFilesInfo for the input file. Lines are copied
# from the input file with "\r\n" and "\r" converted to "\n"; a line starting
# with "$$$$" ends a compound. After every $NumOfCmpdsPerFile compounds the next
# new file is started; once the last new file is open it receives all remaining
# lines.
#
# The input file is skipped with a warning when it can't be opened or, unless
# $OptionsInfo{OverwriteFiles} is set, when any of the new files already exists.
# Dies when a new file can't be opened or closed.
#
# Arguments:
#   $FileIndex         - index of the input file in @SDFilesList
#   $NumOfFiles        - number of new files to generate
#   $NumOfCmpdsPerFile - number of compounds written before starting next file
#
sub SplitSDFileByNumOfFilesAndCmpds {
  my($FileIndex, $NumOfFiles, $NumOfCmpdsPerFile) = @_;
  my($SDFile, $SDFileHandle, $NewSDFileHandle, $CmpdCount, $NewFileIndex, $NewFileName, $MaxCmpdsCount, @NewSDFilesList);

  $SDFile = $SDFilesList[$FileIndex];

  # Open the input file once, using three-arg open and a lexical handle. The
  # handle is released automatically on the early return below...
  if (!open $SDFileHandle, '<', $SDFile) {
    warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
    return;
  }

  # Setup new file names list...
  @NewSDFilesList = ();
  for $NewFileIndex (1 .. $NumOfFiles) {
    $NewFileName = $SDFilesInfo{OutFileRoot}[$FileIndex] . "Part${NewFileIndex}." . $SDFilesInfo{FileExt}[$FileIndex];
    if (!$OptionsInfo{OverwriteFiles}) {
      if (-e $NewFileName) {
        warn "Warning: Ignoring file $SDFile: New SD file, $NewFileName, already exists\n";
        return;
      }
    }
    push @NewSDFilesList, $NewFileName;
  }

  # Compound count at which the current new file is considered full...
  $MaxCmpdsCount = $NumOfCmpdsPerFile;

  $CmpdCount = 0;
  $NewFileIndex = 1;

  $NewFileName = $NewSDFilesList[$NewFileIndex - 1];
  open $NewSDFileHandle, '>', $NewFileName or die "Error: Can't open $NewFileName: $! \n";
  print "\nGenerating $NewFileName file\n";

  LINE: while (my $Line = <$SDFileHandle>) {
    $Line =~ s/(\r\n)|(\r)/\n/g;
    print $NewSDFileHandle $Line;

    # Only the end of a compound record can trigger a switch to the next file...
    if ($Line !~ /^\$\$\$\$/) {
      next LINE;
    }
    $CmpdCount++;

    # Keep writing to the current file until it's full; the last file is never
    # considered full...
    if ($NewFileIndex > $NumOfFiles || $CmpdCount < $MaxCmpdsCount) {
      next LINE;
    }

    $NewFileIndex++;
    $MaxCmpdsCount = $NumOfCmpdsPerFile * $NewFileIndex;

    if ($NewFileIndex <= $NumOfFiles) {
      # Buffered write errors surface at close time...
      close $NewSDFileHandle or die "Error: Couldn't close $NewFileName: $! \n";

      $NewFileName = $NewSDFilesList[$NewFileIndex - 1];
      open $NewSDFileHandle, '>', $NewFileName or die "Error: Can't open $NewFileName: $! \n";
      print "Generating $NewFileName file...\n";
    }
  }
  close $NewSDFileHandle or die "Error: Couldn't close $NewFileName: $! \n";
  close $SDFileHandle;

  return;
}

# Retrieve information about SD files...
#
# Rebuilds the file-level %SDFilesInfo hash. It holds three arrays indexed in
# parallel with @SDFilesList:
#   FileOkay    - 1 when the file exists and passes CheckFileType(); 0 otherwise
#   FileExt     - file extension returned by ParseFileName(); '' for bad files
#   OutFileRoot - root used to name new files; '' for bad files
#
# The value of $OptionsInfo{OutFileRoot} is used as the root only when exactly
# one input file is being processed; otherwise the name of the input file, as
# returned by ParseFileName(), is used.
#
sub RetrieveSDFilesInfo {
  %SDFilesInfo = (FileOkay => [], FileExt => [], OutFileRoot => []);

  FILELIST: for my $Index (0 .. $#SDFilesList) {
    my $SDFile = $SDFilesList[$Index];

    # Start out by assuming the file is not usable...
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{FileExt}[$Index] = '';
    $SDFilesInfo{OutFileRoot}[$Index] = '';

    unless (-e $SDFile) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    unless (CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Setup output file root...
    my($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    my $OutFileRoot;
    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      # Use just the name part when the specified root parses into a name and an
      # extension; otherwise, use the specified root as it is...
      my($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      $OutFileRoot = ($RootFileName && $RootFileExt) ? $RootFileName : $OptionsInfo{OutFileRoot};
    }
    else {
      $OutFileRoot = "$FileName";
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{FileExt}[$Index] = $FileExt;
    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
  }
}

# Process option values...
sub ProcessOptions {
  # Rebuilds the file-level %OptionsInfo hash from the raw values in %Options:
  #   Mode, CmpdsMode        - copied from the mode and cmpdsmode options
  #   NumOfFiles, NumOfCmpds - copied from the numfiles and numcmpds options
  #   DataField              - data field name; set only for "Cmpds" mode with
  #                            "DataField" compounds mode, '' otherwise
  #   OverwriteFiles         - 1 or 0
  #   OutFileRoot            - value of the root option, or 0 when not specified
  #
  # Dies when "Cmpds" mode with "DataField" compounds mode is requested without
  # a data field name.
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{CmpdsMode} = $Options{cmpdsmode};

  $OptionsInfo{NumOfFiles} = $Options{numfiles};
  $OptionsInfo{NumOfCmpds} = $Options{numcmpds};

  $OptionsInfo{DataField} = '';
  if ($Options{mode} =~ /^Cmpds$/i && $Options{cmpdsmode} =~ /^DataField$/i) {
    if (!$Options{datafield}) {
      die "Error: You must specify a value for \"-d, --DataField\" option in \"DataField\" value of \"-c, --CmpdsMode\" during \"Cmpds\" \"-m, --mode\" value. \n";
    }
    $OptionsInfo{DataField} = $Options{datafield};
  }

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;

  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;

  return;
}


# Setup script usage and retrieve command line arguments specified using various options...
#
# Fills the file-level %Options hash with default values, lets GetOptions()
# override them from the command line, changes to the working directory when
# one is specified, and validates the cmpdsmode, mode, numfiles and numcmpds
# values. Dies with an error message on any invalid option or value.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  $Options{cmpdsmode} = 'RootPrefix';
  $Options{mode} = 'Files';

  $Options{numfiles} = 2;
  $Options{numcmpds} = 1;


  if (!GetOptions(\%Options, "cmpdsmode|c=s", "datafield|d=s", "help|h", "mode|m=s", "numfiles|n=i", "numcmpds=i", "overwrite|o", "root|r=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{cmpdsmode} !~ /^(DataField|MolName|RootPrefix)$/i) {
    die "Error: The value specified, $Options{cmpdsmode}, for option \"-c, --CmpdsMode\" is not valid. Allowed values: DataField, MolName, RootPrefix\n";
  }
  if ($Options{mode} !~ /^(Cmpds|Files)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: Cmpds, Files\n";
  }
  if ($Options{numfiles} < 2) {
    die "Error: The value specified, $Options{numfiles}, for option \"-n --numfiles\" is not valid. Allowed values: >= 2 \n";
  }
  if ($Options{numcmpds} < 1) {
    # The numcmpds option has no short form; "-n" belongs to numfiles...
    die "Error: The value specified, $Options{numcmpds}, for option \"--numcmpds\" is not valid. Allowed values: >= 1 \n";
  }

  return;
}