MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # File: InfoNucleicAcids.pl
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
  16 # any warranty; without even the implied warranty of merchantability or fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use FindBin; use lib "$FindBin::Bin/../lib";
  28 use Getopt::Long;
  29 use File::Basename;
  30 use Text::ParseWords;
  31 use Benchmark;
  32 use FileUtil;
  33 use TextUtil;
  34 use NucleicAcids;
  35 
  36 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  37 
  38 # Autoflush STDOUT
  39 $| = 1;
  40 
  41 # Starting message...
  42 $ScriptName = basename($0);
  43 print "\n$ScriptName: Starting...\n\n";
  44 $StartTime = new Benchmark;
  45 
  46 # Get the options and setup script...
  47 SetupScriptUsage();
  48 if ($Options{help}) {
  49   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  50 }
  51 
  52 print "Processing options...\n";
  53 my(%OptionsInfo);
  54 ProcessOptions();
  55 
  56 ListNucleicAcidProperties();
  57 print "\n$ScriptName:Done...\n\n";
  58 
  59 $EndTime = new Benchmark;
  60 $TotalTime = timediff ($EndTime, $StartTime);
  61 print "Total time: ", timestr($TotalTime), "\n";
  62 
  63 ###############################################################################
  64 
  65 # List data for an nucleic acid...
  66 sub ListNucleicAcidData {
  67   my($DataLabelRef, $DataValueRef) = @_;
  68   my($Index, $Line, $Value);
  69 
  70   if ($OptionsInfo{NucleicAcidRowsOutput}) {
  71     $Line = '';
  72     # Format data...
  73     if ($OptionsInfo{OutQuote} || $Options{outdelim} !~ /^comma$/i) {
  74       $Line = JoinWords($DataValueRef, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  75     }
  76     else {
  77       # Always quote values containing commas...
  78       $Line = ($DataValueRef->[0] =~ /\,/) ? qq("$DataValueRef->[0]") : $DataValueRef->[0];
  79       for $Index (1 .. $#{$DataValueRef} ) {
  80         $Value = $DataValueRef->[$Index];
  81         if ($Value =~ /\,/) {
  82           $Value = qq("$Value");
  83         }
  84         $Line .= $OptionsInfo{OutDelim} . $Value;
  85       }
  86     }
  87     if ($OptionsInfo{FileOutput}) {
  88       print OUTFILE "$Line\n";
  89     }
  90     else {
  91       print "$Line\n";
  92     }
  93   }
  94   else {
  95     # Format and list data...
  96     $Line = '';
  97     for $Index (0 .. $#{$DataLabelRef} ) {
  98       $Line = $DataLabelRef->[$Index] . ' ' . $DataValueRef->[$Index];
  99       if ($OptionsInfo{FileOutput}) {
 100         print OUTFILE "$Line\n";
 101       }
 102       else {
 103         print "$Line\n";
 104       }
 105     }
 106   }
 107 }
 108 
 109 # List data for an nucleic acid...
 110 sub ListHeaderRowData {
 111   my($DataLabelRef) = @_;
 112   my($Line);
 113 
 114   # Format data...
 115   $Line = JoinWords($DataLabelRef, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 116   $Line =~ s/\://g;
 117   # List data...
 118   if ($OptionsInfo{FileOutput}) {
 119     print OUTFILE "$Line\n";
 120   }
 121   else {
 122     print "$Line\n";
 123   }
 124 }
 125 
 126 # List properties for nucleic acids...
 127 sub ListNucleicAcidProperties {
 128   my($NucleicAcidID, $NucleicAcidDataRef, $PropertyName, $PropertyValue, @PropertyLabels, @PropertyValues);
 129 
 130   print "Listing information for nucleic acid(s)...\n";
 131 
 132   if ($OptionsInfo{FileOutput}) {
 133     print "Generating file $OptionsInfo{OutFileName}...\n";
 134     open OUTFILE, ">$OptionsInfo{OutFileName}" or die "Couldn't open $OptionsInfo{OutFileName}: $!\n";
 135   }
 136 
 137   # Setup property labels...
 138   @PropertyLabels = ();
 139   for $PropertyName (@{$OptionsInfo{SpecifiedProperies}}) {
 140     push @PropertyLabels, ("$PropertyName:");
 141   }
 142 
 143   if ($OptionsInfo{NucleicAcidRowsOutput}) {
 144     ListHeaderRowData(\@PropertyLabels);
 145   }
 146 
 147   # Go over specified properties...
 148   for $NucleicAcidID (@{$OptionsInfo{SpecifiedNucleicAcidIDs}}) {
 149     $NucleicAcidDataRef = NucleicAcids::GetNucleicAcidPropertiesData($NucleicAcidID);
 150 
 151     if (!$OptionsInfo{NucleicAcidRowsOutput}) {
 152       if ($OptionsInfo{FileOutput}) {
 153         print OUTFILE "\nListing properties for nucleic acid $NucleicAcidID...\n\n";
 154       }
 155       else {
 156         print "\nListing properties for nucleic acid $NucleicAcidID...\n\n";
 157       }
 158     }
 159 
 160     # Collect data..
 161     @PropertyValues = ();
 162     for $PropertyName (@{$OptionsInfo{SpecifiedProperies}}) {
 163       $PropertyValue = $NucleicAcidDataRef->{$PropertyName};
 164       if (IsFloat($PropertyValue)) {
 165         $PropertyValue = sprintf("%.$OptionsInfo{Precision}f", $PropertyValue) + 0;
 166       }
 167       push @PropertyValues, $PropertyValue;
 168     }
 169     # List data...
 170     ListNucleicAcidData(\@PropertyLabels, \@PropertyValues);
 171   }
 172   if ($OptionsInfo{FileOutput}) {
 173     close OUTFILE;
 174   }
 175   print "\n";
 176 }
 177 
 178 # Get propery names from categories...
 179 sub GetPropertyNamesFromCategories {
 180   my($CategoryName) = @_;
 181   my(@PropertyNames);
 182 
 183   @PropertyNames = ();
 184   if ($CategoryName =~ /^Basic$/i) {
 185     @PropertyNames = ('Code', 'OtherCodes', 'Name', 'Type', 'MolecularFormula', 'MolecularWeight');
 186   } elsif ($CategoryName =~ /^BasicPlus$/i) {
 187     @PropertyNames = ('Code', 'OtherCodes', 'Name', 'Type', 'MolecularFormula', 'MolecularWeight', 'ExactMass', 'ElementalComposition');
 188   }
 189 
 190   return @PropertyNames;
 191 }
 192 
 193 # Process option values...
 194 sub ProcessOptions {
 195   %OptionsInfo = ();
 196 
 197   $OptionsInfo{Mode} = $Options{mode};
 198 
 199   $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /^tab$/i ) ? "\t" : (($Options{outdelim} =~ /^semicolon$/i) ? "\;" : "\,");
 200   $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;
 201 
 202   $OptionsInfo{Overwrite} = defined $Options{overwrite} ? $Options{overwrite} : undef;
 203   $OptionsInfo{OutFileRoot} = defined $Options{root} ? $Options{root} : undef;
 204 
 205   $OptionsInfo{Output} = $Options{output};
 206   $OptionsInfo{OutputStyle} = $Options{outputstyle};
 207 
 208   $OptionsInfo{NucleicAcidRowsOutput} = ($Options{outputstyle} =~ /^NucleicAcidRows$/i) ? 1 : 0;
 209   $OptionsInfo{FileOutput} = ($Options{output} =~ /^File$/i) ? 1 : 0;
 210 
 211   $OptionsInfo{Precision} = $Options{precision};
 212 
 213   my($NucleicAcidID, @NucleicAcidIDs);
 214 
 215   @{$OptionsInfo{SpecifiedNucleicAcidIDs}} = ();
 216 
 217   # Set up Nucleic Acids IDs except for All mode...
 218   @NucleicAcidIDs = ();
 219 
 220   if (@ARGV >= 1) {
 221     push @NucleicAcidIDs, @ARGV;
 222   }
 223   else {
 224     # Setup mode specified default values...
 225     if ($Options{mode} =~ /NucleicAcidID/i) {
 226       push @NucleicAcidIDs, 'A';
 227     }
 228     elsif ($Options{mode} =~ /NucleicAcidType/i) {
 229       push @NucleicAcidIDs, 'Nucleoside';
 230     }
 231     else {
 232       push @NucleicAcidIDs, 'A';
 233     }
 234   }
 235 
 236   # Generate list of nucleic acids...
 237   if (@ARGV == 1 && $ARGV[0] =~ /^All$/i) {
 238     push @{$OptionsInfo{SpecifiedNucleicAcidIDs}}, NucleicAcids::GetNucleicAcids();
 239   }
 240   else {
 241     if ($Options{mode} =~ /NucleicAcidID/i) {
 242       ID: for $NucleicAcidID (@NucleicAcidIDs) {
 243         if (NucleicAcids::IsNucleicAcid($NucleicAcidID)) {
 244           push @{$OptionsInfo{SpecifiedNucleicAcidIDs}}, $NucleicAcidID;
 245         }
 246         else {
 247           warn "Ignoring nucleic acid ID, $NucleicAcidID, specified using command line parameter option: Unknown nucleic acid ID...\n";
 248           next ID;
 249         }
 250       }
 251     }
 252     elsif ($Options{mode} =~ /NucleicAcidType/i) {
 253       ID: for $NucleicAcidID (@NucleicAcidIDs) {
 254           if (!NucleicAcids::IsNucleicAcidType($NucleicAcidID)) {
 255             warn "Ignoring nucleic acid type, $NucleicAcidID, specified using command line parameter option: Unknown nucleic acid type...\n";
 256             next ID;
 257           }
 258           push @{$OptionsInfo{SpecifiedNucleicAcidIDs}}, NucleicAcids::GetNucleicAcidsByType($NucleicAcidID);
 259         }
 260       }
 261   }
 262   SetupSpecifiedProperties();
 263 
 264   # Setup output file name...
 265   $OptionsInfo{OutFileName} = '';
 266   if ($OptionsInfo{FileOutput}) {
 267     my($OutFileRoot, $OutFileExt);
 268 
 269     $OutFileRoot = '';
 270     $OutFileExt = "csv";
 271     if ($Options{outdelim} =~ /^tab$/i) {
 272       $OutFileExt = "tsv";
 273     }
 274     if ($Options{root}) {
 275       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
 276       if ($RootFileName && $RootFileExt) {
 277         $OutFileRoot = $RootFileName;
 278       }
 279       else {
 280         $OutFileRoot = $Options{root};
 281       }
 282     }
 283     else {
 284       $OutFileRoot = 'NucleicAcidsInfo';
 285     }
 286     $OptionsInfo{OutFileName} = $OutFileRoot . '.' . $OutFileExt;
 287     if (!$Options{overwrite}) {
 288       if (-e $OptionsInfo{OutFileName}) {
 289         die "Error: Output file, $OptionsInfo{OutFileName}, already exists.\nUse \-o --overwrite\ option or specify a different name using \"-r --root\" option.\n";
 290       }
 291     }
 292   }
 293 }
 294 
 295 # Setup properties to list...
 296 sub SetupSpecifiedProperties {
 297 
 298   $OptionsInfo{Properties} = defined $Options{properties} ? $Options{properties} : undef;
 299 
 300   $OptionsInfo{PropertiesMode} = $Options{propertiesmode};
 301   $OptionsInfo{PropertiesListing} = $Options{propertieslisting};
 302 
 303   # Make sure appropriate properties/category names are specified...
 304   @{$OptionsInfo{SpecifiedProperies}} = ();
 305   if ($Options{properties} && ($Options{propertiesmode} =~ /^All$/i) ) {
 306     warn "Warning: Ignoring values specifed by \"-p --properties\" option: Not valid for All value of \"--propertiesmode\" option...\n";
 307   }
 308   if ($Options{propertiesmode} =~ /^All$/i) {
 309     if ($Options{propertieslisting} =~ /^Alphabetical$/i) {
 310       push @{$OptionsInfo{SpecifiedProperies}}, NucleicAcids::GetNucleicAcidPropertiesNames('Alphabetical');
 311     }
 312     else {
 313       push @{$OptionsInfo{SpecifiedProperies}}, NucleicAcids::GetNucleicAcidPropertiesNames();
 314     }
 315   }
 316   else {
 317     if ($Options{properties}) {
 318       if ($Options{propertiesmode} =~ /^Categories$/i) {
 319         # Check category name...
 320         if ($Options{properties} !~ /^(Basic|BasicPlus)$/i) {
 321           die "Error: The value specified, $Options{properties}, for option \"-p --properties\" in conjunction with \"Categories\" value for option \"--propertiesmode\" is not valid. Allowed values: Basic and BasicPlus\n";
 322         }
 323         # Set propertynames...
 324         push @{$OptionsInfo{SpecifiedProperies}}, GetPropertyNamesFromCategories($Options{properties});
 325       }
 326       else {
 327         # Check property names..
 328         my($Name, $PropertyName, @Names);
 329         @Names = split /\,/, $Options{properties};
 330         NAME: for $Name (@Names) {
 331           $PropertyName = RemoveLeadingAndTrailingWhiteSpaces($Name);
 332           if (NucleicAcids::IsNucleicAcidProperty($PropertyName)) {
 333             push @{$OptionsInfo{SpecifiedProperies}}, $PropertyName;
 334           }
 335           else {
 336             warn "Warning: Ignoring value, $Name, specifed by \"-p --properties\" option: Unknown property name...\n";
 337           }
 338         }
 339         if ($Options{propertieslisting} =~ /^Alphabetical$/i) {
 340           # Code, OtherCodes and Name are always listed first...
 341           my($CodePresent, $OtherCodesPresent, $NamePresent,  @AlphabeticalProperties, %PropertiesMap);
 342           %PropertiesMap = ();
 343           @AlphabeticalProperties = ();
 344           $CodePresent = 0; $OtherCodesPresent = 0; $NamePresent = 0;
 345           NAME: for $Name (@{$OptionsInfo{SpecifiedProperies}}) {
 346             if ($Name =~ /^Code$/i) {
 347               $CodePresent = 1;
 348               next NAME;
 349             }
 350             if ($Name =~ /^OtherCodes$/i) {
 351               $OtherCodesPresent = 1;
 352               next NAME;
 353             }
 354             if ($Name =~ /^Name$/i) {
 355               $NamePresent = 1;
 356               next NAME;
 357             }
 358             $PropertiesMap{$Name} = $Name;
 359           }
 360           # Setup the alphabetical list...
 361           if ($CodePresent) {
 362             push @AlphabeticalProperties, 'Code';
 363           }
 364           if ($OtherCodesPresent) {
 365             push @AlphabeticalProperties, 'OtherCodesPresent';
 366           }
 367           if ($NamePresent) {
 368             push @AlphabeticalProperties, 'Name';
 369           }
 370           for $Name (sort keys %PropertiesMap) {
 371             push @AlphabeticalProperties, $Name;
 372           }
 373           @{$OptionsInfo{SpecifiedProperies}} = ();
 374           push @{$OptionsInfo{SpecifiedProperies}}, @AlphabeticalProperties;
 375         }
 376       }
 377     }
 378     else {
 379       # Set default value...
 380       push @{$OptionsInfo{SpecifiedProperies}}, GetPropertyNamesFromCategories('Basic');
 381     }
 382   }
 383 }
 384 
 385 # Setup script usage  and retrieve command line arguments specified using various options...
 386 sub SetupScriptUsage {
 387 
 388   # Retrieve all the options...
 389   %Options = ();
 390   $Options{mode} = "NucleicAcidID";
 391   $Options{outdelim} = "comma";
 392   $Options{output} = "STDOUT";
 393   $Options{outputstyle} = "NucleicAcidBlock";
 394   $Options{precision} = 4;
 395   $Options{propertiesmode} = "Categories";
 396   $Options{propertieslisting} = "ByGroup";
 397   $Options{quote} = "yes";
 398 
 399   if (!GetOptions(\%Options, "help|h", "mode|m=s", "outdelim=s", "output=s", "outputstyle=s", "overwrite|o", "precision=i", "properties|p=s", "propertieslisting=s", "propertiesmode=s", "quote|q=s", "root|r=s", "workingdir|w=s")) {
 400     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 401   }
 402   if ($Options{workingdir}) {
 403     if (! -d $Options{workingdir}) {
 404       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 405     }
 406     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 407   }
 408   if ($Options{mode} !~ /^(NucleicAcidID|NucleicAcidType)$/i) {
 409     die "Error: The value specified, $Options{mode}, for option \"--mode\" is not valid. Allowed values: NucleicAcidID or NucleicAcidType\n";
 410   }
 411   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 412     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 413   }
 414   if ($Options{output} !~ /^(STDOUT|File)$/i) {
 415     die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: STDOUT or File\n";
 416   }
 417   if ($Options{outputstyle} !~ /^(NucleicAcidBlock|NucleicAcidRows)$/i) {
 418     die "Error: The value specified, $Options{outputstyle}, for option \"--outputstyle\" is not valid. Allowed values: NucleicAcidBlock or NucleicAcidRows\n";
 419   }
 420   if (!IsPositiveInteger($Options{precision})) {
 421     die "Error: The value specified, $Options{precision}, for option \"-p --precision\" is not valid. Allowed values: > 0 \n";
 422   }
 423   if ($Options{propertiesmode} !~ /^(Categories|Names|All)$/i) {
 424     die "Error: The value specified, $Options{propertiesmode}, for option \"--propertiesmode\" is not valid. Allowed values: Categories, Names, or All\n";
 425   }
 426   if ($Options{propertieslisting} !~ /^(ByGroup|Alphabetical)$/i) {
 427     die "Error: The value specified, $Options{propertieslisting}, for option \"--propertieslisting\" is not valid. Allowed values: ByGroup, or Alphabetical\n";
 428   }
 429   if ($Options{quote} !~ /^(yes|no)$/i) {
 430     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
 431   }
 432 }
 433