MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # File: SplitTextFiles.pl
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
  16 # any warranty; without even the implied warranty of merchantability or fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use FindBin; use lib "$FindBin::Bin/../lib";
  28 use Getopt::Long;
  29 use File::Basename;
  30 use Text::ParseWords;
  31 use Benchmark;
  32 use FileUtil;
  33 use TextUtil;
  34 
  35 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  36 
  37 # Autoflush STDOUT
  38 $| = 1;
  39 
  40 # Starting message...
  41 $ScriptName = basename $0;
  42 print "\n$ScriptName:Starting...\n\n";
  43 $StartTime = new Benchmark;
  44 
  45 # Get the options and setup script...
  46 SetupScriptUsage();
  47 if ($Options{help} || @ARGV < 1) {
  48   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  49 }
  50 
  51 my(@TextFilesList);
  52 @TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");
  53 
  54 # Process options...
  55 print "Processing options...\n";
  56 my(%OptionsInfo);
  57 ProcessOptions();
  58 
  59 print "Checking input text file(s)...\n";
  60 my(%TextFilesInfo);
  61 RetrieveTextFilesInfo();
  62 
  63 # Generate output files...
  64 my($FileIndex);
  65 if (@TextFilesList > 1) {
  66   print "\nProcessing text files...\n";
  67 }
  68 for $FileIndex (0 .. $#TextFilesList) {
  69   if ($TextFilesInfo{FileOkay}[$FileIndex]) {
  70     print "\nProcessing file $TextFilesList[$FileIndex]...\n";
  71     SplitTextFile($FileIndex);
  72   }
  73 }
  74 
  75 print "\n$ScriptName:Done...\n\n";
  76 
  77 $EndTime = new Benchmark;
  78 $TotalTime = timediff ($EndTime, $StartTime);
  79 print "Total time: ", timestr($TotalTime), "\n";
  80 
  81 ###############################################################################
  82 
  83 # Split a Text file...
  84 #
  85 sub SplitTextFile {
  86   my($FileIndex) = @_;
  87   my($TextFile, $LineCount, $MaxLinesPerFile, $MaxNumOfFiles);
  88 
  89   $TextFile = $TextFilesList[$FileIndex];
  90 
  91   if (!open TEXTFILE, "$TextFile") {
  92     warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
  93     return;
  94   }
  95 
  96   $MaxNumOfFiles = $OptionsInfo{NumOfFiles};
  97 
  98   # Count number of lines to figure out maximum number of lines per file...
  99   $LineCount = 0;
 100   while (<TEXTFILE>) {
 101       $LineCount++;
 102   }
 103   close TEXTFILE;
 104 
 105   if ($LineCount < $MaxNumOfFiles) {
 106     warn "Warning: Ignoring file $TextFile: Total number of lines, $LineCount, is smaller than\nnumber of new files, $MaxNumOfFiles\n";
 107     return;
 108   }
 109 
 110   $MaxLinesPerFile = int $LineCount / $MaxNumOfFiles;
 111 
 112   GenerateTextFiles($FileIndex, $MaxNumOfFiles, $MaxLinesPerFile);
 113 }
 114 
 115 # Generate new Text files...
 116 #
 117 sub GenerateTextFiles {
 118   my($FileIndex, $NumOfFiles, $NumOfLinesPerFile) = @_;
 119   my($TextFile, $LineCount, $NewFileIndex, $NewFileName, $MaxLinesCount, $InDelim, $OutDelim, $OutQuote, $ColLabelsLine, $Line, @ColLabels, @Words, @NewTextFilesList);
 120 
 121   # Setup new file names list...
 122   @NewTextFilesList = ();
 123   for $NewFileIndex (1 .. $NumOfFiles) {
 124     $NewFileName = $TextFilesInfo{OutFileRoot}[$FileIndex] . "Part${NewFileIndex}." . $TextFilesInfo{OutFileExt}[$FileIndex];
 125     if (!$OptionsInfo{OverwriteFiles}) {
 126       if (-e $NewFileName) {
 127         warn "Warning: Ignoring file $TextFile: New Text file, $NewFileName, already exists\n";
 128         return;
 129       }
 130     }
 131     push @NewTextFilesList, $NewFileName;
 132   }
 133 
 134   $TextFile = $TextFilesList[$FileIndex];
 135 
 136   if (!open TEXTFILE, "$TextFile") {
 137     warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
 138     return;
 139   }
 140 
 141   $InDelim = $TextFilesInfo{InDelim}[$FileIndex];
 142 
 143   $OutDelim = $OptionsInfo{OutDelim};
 144   $OutQuote = $OptionsInfo{OutQuote};
 145 
 146   $MaxLinesCount = $NumOfLinesPerFile;
 147   $LineCount = 0;
 148   $NewFileIndex = 1;
 149 
 150   open NEWTEXTFILE, ">$NewTextFilesList[$NewFileIndex - 1]" or die "Error: Can't open $NewTextFilesList[$NewFileIndex -1]: $! \n";
 151   print "Generating $NewTextFilesList[$NewFileIndex - 1] file...\n";
 152 
 153   if ($OptionsInfo{Label}) {
 154     if ($OptionsInfo{Fast}) {
 155       $ColLabelsLine = GetTextLine(\*TEXTFILE);
 156     }
 157     else {
 158       $Line = GetTextLine(\*TEXTFILE);
 159       @ColLabels = quotewords($InDelim, 0, $Line);
 160       $ColLabelsLine = JoinWords(\@ColLabels, $OutDelim, $OutQuote);
 161     }
 162     print NEWTEXTFILE "$ColLabelsLine\n";
 163   }
 164 
 165   while ($Line = GetTextLine(\*TEXTFILE)) {
 166     $LineCount++;
 167 
 168     if (!$Options{fast}) {
 169       @Words = quotewords($InDelim, 0, $Line);
 170       $Line = JoinWords(\@Words, $OutDelim, $OutQuote);
 171     }
 172     print NEWTEXTFILE "$Line\n";
 173 
 174     if ($NewFileIndex <= $NumOfFiles) {
 175       if ($LineCount >= $MaxLinesCount) {
 176         if ($NewFileIndex < $NumOfFiles) {
 177           close NEWTEXTFILE;
 178         }
 179         $NewFileIndex++;
 180         $MaxLinesCount = $NumOfLinesPerFile * $NewFileIndex;
 181 
 182         if ($NewFileIndex <= $NumOfFiles) {
 183           open NEWTEXTFILE, ">$NewTextFilesList[$NewFileIndex - 1]" or die "Error: Can't open $NewTextFilesList[$NewFileIndex -1]: $! \n";
 184           print "Generating $NewTextFilesList[$NewFileIndex - 1] file...\n";
 185 
 186           if ($OptionsInfo{Label}) {
 187             print NEWTEXTFILE "$ColLabelsLine\n";
 188           }
 189         }
 190       }
 191     }
 192   }
 193   close NEWTEXTFILE;
 194   close TEXTFILE;
 195 }
 196 
 197 # Retrieve information about Text files...
 198 sub RetrieveTextFilesInfo {
 199   my($Index, $TextFile, $InDelim, $FileDir, $FileName, $FileExt, $OutFileRoot, $OutFileExt);
 200 
 201   %TextFilesInfo = ();
 202   @{$TextFilesInfo{FileOkay}} = ();
 203   @{$TextFilesInfo{InDelim}} = ();
 204   @{$TextFilesInfo{OutFileRoot}} = ();
 205   @{$TextFilesInfo{OutFileExt}} = ();
 206 
 207   FILELIST: for $Index (0 .. $#TextFilesList) {
 208     $TextFilesInfo{FileOkay}[$Index] = 0;
 209     $TextFilesInfo{InDelim}[$Index] = "";
 210     $TextFilesInfo{OutFileRoot}[$Index] = "";
 211     $TextFilesInfo{OutFileExt}[$Index] = "";
 212 
 213     $TextFile = $TextFilesList[$Index];
 214     if (!(-e $TextFile)) {
 215       warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
 216       next FILELIST;
 217     }
 218     if (!CheckFileType($TextFile, "csv tsv")) {
 219       warn "Warning: Ignoring file $TextFile: It's not a Text file\n";
 220       next FILELIST;
 221     }
 222     if (! open TEXTFILE, "$TextFile") {
 223       warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
 224       next FILELIST;
 225     }
 226     close TEXTFILE;
 227 
 228     ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
 229 
 230     # Setup input delimiter...
 231     $InDelim = '';
 232     if (!$OptionsInfo{Fast}) {
 233       if ($FileExt =~ /^tsv$/i) {
 234         $InDelim = "\t";
 235       }
 236       else {
 237         $InDelim = "\,";
 238         if ($OptionsInfo{InDelim} !~ /^(comma|semicolon)$/i) {
 239           warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n";
 240           next FILELIST;
 241         }
 242         if ($OptionsInfo{InDelim} =~ /^semicolon$/i) {
 243           $InDelim = "\;";
 244         }
 245       }
 246     }
 247 
 248     # Setup output file root...
 249     $OutFileExt = $OptionsInfo{Fast} ? $FileExt : (($Options{outdelim} =~ /^tab$/i ) ? "tsv" : "csv");
 250 
 251     if ($OptionsInfo{OutFileRoot} && (@TextFilesList == 1)) {
 252       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 253       if ($RootFileName && $RootFileExt) {
 254         $FileName = $RootFileName;
 255       }
 256       else {
 257         $FileName = $OptionsInfo{OutFileRoot};
 258       }
 259       $OutFileRoot = $FileName;
 260     }
 261     else {
 262       $OutFileRoot = $FileName;
 263     }
 264 
 265     $TextFilesInfo{FileOkay}[$Index] = 1;
 266     $TextFilesInfo{InDelim}[$Index] = $InDelim;
 267     $TextFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
 268     $TextFilesInfo{OutFileExt}[$Index] = $OutFileExt;
 269   }
 270 }
 271 
 272 # Process option values...
 273 sub ProcessOptions {
 274 
 275   %OptionsInfo = ();
 276 
 277   $OptionsInfo{Fast} = defined $Options{fast} ? $Options{fast} : undef;
 278 
 279   $OptionsInfo{InDelim} = $Options{indelim};
 280   $OptionsInfo{Label} = ($Options{label} =~ /^yes$/i) ? 1 : 0;
 281 
 282   $OptionsInfo{NumOfFiles} = $Options{numfiles};
 283 
 284   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : undef;
 285   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? $Options{overwrite} : undef;
 286 
 287   $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /^tab$/i ) ? "\t" : (($Options{outdelim} =~ /^semicolon$/i) ? "\;" : "\,");
 288   $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;
 289 }
 290 
 291 # Setup script usage  and retrieve command line arguments specified using various options...
 292 sub SetupScriptUsage {
 293 
 294   # Retrieve all the options...
 295   %Options = ();
 296   $Options{label} = "yes";
 297   $Options{numfiles} = 2;
 298   $Options{indelim} = "comma";
 299   $Options{outdelim} = "comma";
 300   $Options{quote} = "yes";
 301   if (!GetOptions(\%Options, "fast|f", "help|h", "indelim=s", "label|l=s", "numfiles|n=i", "outdelim=s", "overwrite|o", "quote|q=s", "root|r=s", "workingdir|w=s")) {
 302     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 303   }
 304   if ($Options{workingdir}) {
 305     if (! -d $Options{workingdir}) {
 306       die "Error: The value specified, $Options{workingdir},  for option \"-w --workingdir\" is not a directory name.\n";
 307     }
 308     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 309   }
 310   if ($Options{numfiles} < 2) {
 311     die "Error: The value specified, $Options{numfiles},  for option \"-n --numfiles\" is not valid. Allowed values: >= 2 \n";
 312   }
 313   if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
 314     die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
 315   }
 316   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 317     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 318   }
 319   if ($Options{quote} !~ /^(yes|no)$/i) {
 320     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
 321   }
 322   if ($Options{label} !~ /^(yes|no)$/i) {
 323     die "Error: The value specified, $Options{label}, for option \"-l --label\" is not valid. Allowed values: yes or no\n";
 324   }
 325 }
 326