MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # File: TextFilesToSDFiles.pl
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
   16 # any warranty; without even the implied warranty of merchantability or fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use FindBin; use lib "$FindBin::Bin/../lib";
  28 use Getopt::Long;
  29 use File::Basename;
  30 use Text::ParseWords;
  31 use Benchmark;
  32 use FileUtil;
  33 use TextUtil;
  34 use SDFileUtil;
  35 
  36 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  37 
  38 # Autoflush STDOUT
  39 $| = 1;
  40 
  41 # Starting message...
  42 $ScriptName = basename $0;
  43 print "\n$ScriptName:Starting...\n\n";
  44 $StartTime = new Benchmark;
  45 
  46 # Get the options and setup script...
  47 SetupScriptUsage();
  48 if ($Options{help} || @ARGV < 1) {
  49   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  50 }
  51 
  52 my(@TextFilesList);
  53 @TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");
  54 
  55 print "Processing options...\n";
  56 my(%OptionsInfo);
  57 ProcessOptions();
  58 
  59 print "Checking input text file(s)...\n";
  60 my(%TextFilesInfo);
  61 RetrieveTextFilesInfo();
  62 
  63 # Generate output files...
  64 my($FileIndex);
  65 if (@TextFilesList > 1) {
  66   print "\nProcessing text files...\n";
  67 }
  68 for $FileIndex (0 .. $#TextFilesList) {
  69   if ($TextFilesInfo{FileOkay}[$FileIndex]) {
  70     print "\nProcessing file $TextFilesList[$FileIndex]...\n";
  71     ConvertTextFile($FileIndex);
  72   }
  73 }
  74 print "\n$ScriptName:Done...\n\n";
  75 
  76 $EndTime = new Benchmark;
  77 $TotalTime = timediff ($EndTime, $StartTime);
  78 print "Total time: ", timestr($TotalTime), "\n";
  79 
  80 ###############################################################################
  81 
  82 # Convert text file to SD file...
  83 sub ConvertTextFile {
  84   my($Index) = @_;
  85   my($TextFile, $SDFile, $Line, $InDelim, $Label, $Value, $ColIndex, $ColCount, @ColLabels, @LineWords);
  86 
  87   $TextFile = $TextFilesList[$Index];
  88   $InDelim = $TextFilesInfo{InDelim}[$Index];
  89   $SDFile = $TextFilesInfo{OutSDFile}[$Index];
  90   @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]};
  91   $ColCount = @ColLabels;
  92 
  93   print "Generating SD file $SDFile...\n";
  94   open SDFILE, ">$SDFile" or die "Error: Couldn't open $SDFile: $! \n";
  95   open TEXTFILE, "$TextFile" or die "Error: Can't open $TextFile: $! \n";
  96   if ($OptionsInfo{ColLabelsPresent}) {
  97     # Skip over column labels from old file...
  98     $Line = GetTextLine(\*TEXTFILE);
  99   }
 100   my($Date) = GenerateMiscLineDateStamp();
 101   while ($Line = GetTextLine(\*TEXTFILE)) {
 102     @LineWords = quotewords($InDelim, 0, $Line);
 103 
 104     # Write out empty CTAB block...
 105     print SDFILE GenerateEmptyCtabBlockLines($Date), "\n";
 106 
 107     # Write out data fields and values...
 108     for $ColIndex (0 .. $#LineWords) {
 109       if ($ColIndex < $ColCount) {
 110         $Label = $ColLabels[$ColIndex];
 111         $Value = $LineWords[$ColIndex];
 112         print SDFILE "> <$Label>\n$Value\n\n";
 113       }
 114     }
 115     print SDFILE "\$\$\$\$\n";
 116   }
 117   close SDFILE;
 118   close TEXTFILE;
 119 }
 120 
 121 # Retrieve information about input text files...
 122 sub RetrieveTextFilesInfo {
 123   my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @LineWords, @ColLabels, $OutFileRoot,  $OutFile, $ColNum, $ColLabel);
 124 
 125   %TextFilesInfo = ();
 126 
 127   @{$TextFilesInfo{FileOkay}} = ();
 128   @{$TextFilesInfo{ColCount}} = ();
 129   @{$TextFilesInfo{ColLabels}} = ();
 130   @{$TextFilesInfo{InDelim}} = ();
 131   @{$TextFilesInfo{OutSDFile}} = ();
 132 
 133 
 134   FILELIST: for $Index (0 .. $#TextFilesList) {
 135     $TextFile = $TextFilesList[$Index];
 136 
 137     $TextFilesInfo{FileOkay}[$Index] = 0;
 138     $TextFilesInfo{ColCount}[$Index] = 0;
 139     $TextFilesInfo{InDelim}[$Index] = "";
 140     $TextFilesInfo{OutSDFile}[$Index] = "";
 141 
 142     @{$TextFilesInfo{ColLabels}[$Index]} = ();
 143 
 144     if (!(-e $TextFile)) {
 145       warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
 146       next FILELIST;
 147     }
 148     if (!CheckFileType($TextFile, "csv tsv")) {
 149       warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
 150       next FILELIST;
 151     }
 152     ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
 153     if ($FileExt =~ /^tsv$/i) {
 154       $InDelim = "\t";
 155     }
 156     else {
 157       $InDelim = "\,";
 158       if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
 159         warn "Warning: Ignoring file $TextFile: The value specified, $Options{indelim}, for option \"--indelim\" is not valid for csv files\n";
 160         next FILELIST;
 161       }
 162       if ($Options{indelim} =~ /^semicolon$/i) {
 163         $InDelim = "\;";
 164       }
 165     }
 166     if (!open TEXTFILE, "$TextFile") {
 167       warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
 168       next FILELIST;
 169     }
 170     $Line = GetTextLine(\*TEXTFILE);
 171     @LineWords = quotewords($InDelim, 0, $Line);
 172     @ColLabels = ();
 173     if ($OptionsInfo{ColLabelsPresent}) {
 174       push @ColLabels, @LineWords;
 175     }
 176     else {
 177       for $ColNum (1 .. @LineWords) {
 178         $ColLabel = "Column${ColNum}Data";
 179         push @ColLabels, $ColLabel;
 180       }
 181     }
 182     close TEXTFILE;
 183 
 184     $FileDir = ""; $FileName = ""; $FileExt = "";
 185     ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
 186     if ($Options{root} && (@TextFilesList == 1)) {
 187       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
 188       if ($RootFileName && $RootFileExt) {
 189         $FileName = $RootFileName;
 190       }
 191       else {
 192         $FileName = $Options{root};
 193       }
 194       $OutFileRoot = $FileName;
 195     }
 196     else {
 197       $OutFileRoot = "${FileName}WithNoStrData";
 198     }
 199 
 200     $OutFile = "${OutFileRoot}.sdf";
 201     if (!$Options{overwrite}) {
 202       if (-e $OutFile) {
 203         warn "Warning: Ignoring file $TextFile: The file $OutFile already exists\n";
 204         next FILELIST;
 205       }
 206     }
 207     $TextFilesInfo{FileOkay}[$Index] = 1;
 208     $TextFilesInfo{InDelim}[$Index] = $InDelim;
 209     $TextFilesInfo{OutSDFile}[$Index] = "$OutFile";
 210 
 211     $TextFilesInfo{ColCount}[$Index] = @ColLabels;
 212     push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
 213   }
 214 }
 215 
 216 # Process option values...
 217 sub ProcessOptions {
 218   %OptionsInfo = ();
 219 
 220   $OptionsInfo{Label} = $Options{label};
 221   $OptionsInfo{ColLabelsPresent} = ($Options{label} =~ /^yes$/i) ? 1 : 0;
 222 
 223   $OptionsInfo{InDelim} = $Options{indelim};
 224   $OptionsInfo{Overwrite} = defined $Options{overwrite} ? $Options{overwrite} : undef;
 225 
 226   $OptionsInfo{OutFileRoot} = defined $Options{root} ? $Options{root} : undef;
 227 
 228 }
 229 
 230 # Setup script usage  and retrieve command line arguments specified using various options...
 231 sub SetupScriptUsage {
 232 
 233   # Retrieve all the options...
 234   %Options = ();
 235   $Options{label} = "yes";
 236   $Options{indelim} = "comma";
 237   if (!GetOptions(\%Options, "help|h", "indelim=s", "label|l=s", "overwrite|o", "root|r=s", "workingdir|w=s")) {
 238     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 239   }
 240   if ($Options{workingdir}) {
 241     if (! -d $Options{workingdir}) {
 242       die "Error: The value specified, $Options{workingdir},  for option \"-w --workingdir\" is not a directory name.\n";
 243     }
 244     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 245   }
 246   if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
 247     die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
 248   }
 249   if ($Options{label} !~ /^(yes|no)$/i) {
 250     die "Error: The value specified, $Options{label}, for option \"-l --label\" is not valid. Allowed values: yes or no\n";
 251   }
 252 }
 253