#!/usr/bin/perl -w
#
# File: SplitTextFiles.pl
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2024 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability or fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;

# File-scoped state shared by the subroutines below:
#   %Options        raw command line option values filled in by GetOptions
#   %OptionsInfo    processed option values (see ProcessOptions)
#   @TextFilesList  input file names to split
#   %TextFilesInfo  per-file arrays indexed in parallel with @TextFilesList
#
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename $0;
print "\n$ScriptName:Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

print "Checking input text file(s)...\n";
my(%TextFilesInfo);
RetrieveTextFilesInfo();

# Generate output files; files flagged as not okay during the check are skipped...
my($FileIndex);
if (@TextFilesList > 1) {
  print "\nProcessing text files...\n";
}
for $FileIndex (0 .. $#TextFilesList) {
  if ($TextFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $TextFilesList[$FileIndex]...\n";
    SplitTextFile($FileIndex);
  }
}

print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Split a Text file...
#
# Counts the lines in the input file at position $FileIndex of @TextFilesList,
# derives the number of lines per new file as int(LineCount / NumOfFiles) and
# hands off to GenerateTextFiles. The count covers every line read from the
# file, including a column label line when one is present.
#
# The file is skipped with a warning when it can't be opened or when it has
# fewer lines than the number of files requested.
#
sub SplitTextFile {
  my($FileIndex) = @_;
  my($TextFile, $LineCount, $MaxLinesPerFile, $MaxNumOfFiles);

  $TextFile = $TextFilesList[$FileIndex];

  # Three-argument open: the file name is never interpreted as a mode or command...
  if (!open(TEXTFILE, "<", $TextFile)) {
    warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
    return;
  }

  $MaxNumOfFiles = $OptionsInfo{NumOfFiles};

  # Count number of lines to figure out maximum number of lines per file...
  $LineCount = 0;
  while (<TEXTFILE>) {
    $LineCount++;
  }
  close TEXTFILE;

  if ($LineCount < $MaxNumOfFiles) {
    warn "Warning: Ignoring file $TextFile: Total number of lines, $LineCount, is smaller than\nnumber of new files, $MaxNumOfFiles\n";
    return;
  }

  # int is a named unary operator and binds looser than "/"; parenthesized for clarity...
  $MaxLinesPerFile = int($LineCount / $MaxNumOfFiles);

  GenerateTextFiles($FileIndex, $MaxNumOfFiles, $MaxLinesPerFile);
}

# Generate new Text files...
#
# Parameters:
#   $FileIndex          index of the input file in @TextFilesList
#   $NumOfFiles         number of new files to write
#   $NumOfLinesPerFile  number of data lines written before switching to the
#                       next new file
#
# New files are named <OutFileRoot>Part<N>.<OutFileExt> for N = 1 .. $NumOfFiles.
# Lines left over after the last switch are appended to the last file. With
# labels turned on, the first input line is repeated at the top of each new
# file. Unless the fast option is set, each line is parsed using the input
# delimiter and rejoined using the output delimiter and quote settings.
#
# Returns without writing anything, after a warning, when a target file already
# exists and overwriting is off, or when the input can't be opened. Dies when a
# new file can't be opened for writing.
#
sub GenerateTextFiles {
  my($FileIndex, $NumOfFiles, $NumOfLinesPerFile) = @_;
  my($TextFile, $LineCount, $NewFileIndex, $NewFileName, $MaxLinesCount, $InDelim, $OutDelim, $OutQuote, $ColLabelsLine, $Line, @ColLabels, @Words, @NewTextFilesList);

  # Set up the input file name first: it's used in the warning issued below...
  $TextFile = $TextFilesList[$FileIndex];

  # Setup new file names list...
  @NewTextFilesList = ();
  for $NewFileIndex (1 .. $NumOfFiles) {
    $NewFileName = $TextFilesInfo{OutFileRoot}[$FileIndex] . "Part${NewFileIndex}." . $TextFilesInfo{OutFileExt}[$FileIndex];
    if (!$OptionsInfo{OverwriteFiles}) {
      if (-e $NewFileName) {
        warn "Warning: Ignoring file $TextFile: New Text file, $NewFileName, already exists\n";
        return;
      }
    }
    push @NewTextFilesList, $NewFileName;
  }

  if (!open(TEXTFILE, "<", $TextFile)) {
    warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
    return;
  }

  $InDelim = $TextFilesInfo{InDelim}[$FileIndex];

  $OutDelim = $OptionsInfo{OutDelim};
  $OutQuote = $OptionsInfo{OutQuote};

  # $MaxLinesCount is the cumulative data line count at which the current new
  # file is complete...
  $MaxLinesCount = $NumOfLinesPerFile;
  $LineCount = 0;
  $NewFileIndex = 1;

  open(NEWTEXTFILE, ">", $NewTextFilesList[$NewFileIndex - 1]) or die "Error: Can't open $NewTextFilesList[$NewFileIndex -1]: $! \n";
  print "Generating $NewTextFilesList[$NewFileIndex - 1] file...\n";

  if ($OptionsInfo{Label}) {
    if ($OptionsInfo{Fast}) {
      # Fast mode: column labels line is copied as is...
      $ColLabelsLine = GetTextLine(\*TEXTFILE);
    }
    else {
      $Line = GetTextLine(\*TEXTFILE);
      @ColLabels = quotewords($InDelim, 0, $Line);
      $ColLabelsLine = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
    }
    print NEWTEXTFILE "$ColLabelsLine\n";
  }

  # NOTE(review): the loop stops at the first false value returned by
  # GetTextLine; presumably that only happens at end of file - confirm against
  # the FileUtil implementation.
  while ($Line = GetTextLine(\*TEXTFILE)) {
    $LineCount++;

    if (!$OptionsInfo{Fast}) {
      @Words = quotewords($InDelim, 0, $Line);
      $Line = JoinWords(\@Words, $OutDelim, $OutQuote);
    }
    print NEWTEXTFILE "$Line\n";

    if ($NewFileIndex <= $NumOfFiles) {
      if ($LineCount >= $MaxLinesCount) {
        # The last new file stays open to collect any leftover lines...
        if ($NewFileIndex < $NumOfFiles) {
          close NEWTEXTFILE;
        }
        $NewFileIndex++;
        $MaxLinesCount = $NumOfLinesPerFile * $NewFileIndex;

        if ($NewFileIndex <= $NumOfFiles) {
          open(NEWTEXTFILE, ">", $NewTextFilesList[$NewFileIndex - 1]) or die "Error: Can't open $NewTextFilesList[$NewFileIndex -1]: $! \n";
          print "Generating $NewTextFilesList[$NewFileIndex - 1] file...\n";

          if ($OptionsInfo{Label}) {
            print NEWTEXTFILE "$ColLabelsLine\n";
          }
        }
      }
    }
  }
  close NEWTEXTFILE;
  close TEXTFILE;
}

# Retrieve information about Text files...
#
# Fills %TextFilesInfo with four arrays indexed in parallel with @TextFilesList:
#   FileOkay     1 when the file passed all the checks, 0 otherwise
#   InDelim      input delimiter; empty string in fast mode
#   OutFileRoot  root used to build the new file names
#   OutFileExt   extension used for the new file names
#
# A file is flagged as not okay, with a warning, when it doesn't exist, isn't
# accepted by CheckFileType as csv/tsv, can't be opened, or is a csv file
# combined with an invalid input delimiter option.
#
sub RetrieveTextFilesInfo {
  my($Index, $TextFile, $InDelim, $FileDir, $FileName, $FileExt, $OutFileRoot, $OutFileExt);

  %TextFilesInfo = ();
  @{$TextFilesInfo{FileOkay}} = ();
  @{$TextFilesInfo{InDelim}} = ();
  @{$TextFilesInfo{OutFileRoot}} = ();
  @{$TextFilesInfo{OutFileExt}} = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    # Start out with the defaults used for a file which fails the checks...
    $TextFilesInfo{FileOkay}[$Index] = 0;
    $TextFilesInfo{InDelim}[$Index] = "";
    $TextFilesInfo{OutFileRoot}[$Index] = "";
    $TextFilesInfo{OutFileExt}[$Index] = "";

    $TextFile = $TextFilesList[$Index];
    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a Text file\n";
      next FILELIST;
    }
    # Open and close right away just to make sure the file is readable...
    if (!open(TEXTFILE, "<", $TextFile)) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      next FILELIST;
    }
    close TEXTFILE;

    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);

    # Setup input delimiter: tab for tsv files; comma or semicolon for others...
    $InDelim = '';
    if (!$OptionsInfo{Fast}) {
      if ($FileExt =~ /^tsv$/i) {
        $InDelim = "\t";
      }
      else {
        $InDelim = "\,";
        if ($OptionsInfo{InDelim} !~ /^(comma|semicolon)$/i) {
          warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n";
          next FILELIST;
        }
        if ($OptionsInfo{InDelim} =~ /^semicolon$/i) {
          $InDelim = "\;";
        }
      }
    }

    # Setup output file extension: lines are copied as is in fast mode, so the
    # input extension is kept; otherwise it follows the output delimiter...
    $OutFileExt = $OptionsInfo{Fast} ? $FileExt : (($Options{outdelim} =~ /^tab$/i ) ? "tsv" : "csv");

    # Setup output file root: the root option is used only for a single input file...
    if ($OptionsInfo{OutFileRoot} && (@TextFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = $FileName;
    }

    $TextFilesInfo{FileOkay}[$Index] = 1;
    $TextFilesInfo{InDelim}[$Index] = $InDelim;
    $TextFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $TextFilesInfo{OutFileExt}[$Index] = $OutFileExt;
  }
}

# Process option values...
#
# Converts the raw values in %Options into %OptionsInfo:
#   Fast, OutFileRoot, OverwriteFiles  option value, or undef when not specified
#   InDelim                            input delimiter name as specified
#   NumOfFiles                         number of new files to generate
#   Label, OutQuote                    1 for "yes", 0 otherwise
#   OutDelim                           delimiter character: tab, semicolon or comma
#
sub ProcessOptions {

  %OptionsInfo = ();

  $OptionsInfo{Fast} = defined $Options{fast} ? $Options{fast} : undef;

  $OptionsInfo{InDelim} = $Options{indelim};
  $OptionsInfo{Label} = ($Options{label} =~ /^yes$/i) ? 1 : 0;

  $OptionsInfo{NumOfFiles} = $Options{numfiles};

  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : undef;
  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? $Options{overwrite} : undef;

  $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /^tab$/i ) ? "\t" : (($Options{outdelim} =~ /^semicolon$/i) ? "\;" : "\,");
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Seeds %Options with the default values, parses the command line using
# GetOptions, changes to the working directory when one is specified and
# validates the option values. Dies with an error message on any invalid value.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();
  $Options{label} = "yes";
  $Options{numfiles} = 2;
  $Options{indelim} = "comma";
  $Options{outdelim} = "comma";
  $Options{quote} = "yes";
  if (!GetOptions(\%Options, "fast|f", "help|h", "indelim=s", "label|l=s", "numfiles|n=i", "outdelim=s", "overwrite|o", "quote|q=s", "root|r=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{numfiles} < 2) {
    die "Error: The value specified, $Options{numfiles}, for option \"-n --numfiles\" is not valid. Allowed values: >= 2 \n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{label} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{label}, for option \"-l --label\" is not valid. Allowed values: yes or no\n";
  }
}
