#!/usr/bin/perl -w
#
# File: InfoAminoAcids.pl
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2024 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;
use AminoAcids;

# File-scoped state: script name, raw command line options, and timing data...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
# Direct method call instead of the ambiguous indirect object form "new Benchmark"...
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help}) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

ListAminoAcidProperties();
print "\n$ScriptName:Done...\n\n";

# Direct method call instead of the ambiguous indirect object form "new Benchmark"...
$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Write one line, followed by a newline, to the current output destination...
#
# The destination is the output file handle stored in %OptionsInfo by
# ListAminoAcidProperties when file output is selected; otherwise STDOUT.
#
sub _WriteOutputLine {
  my($Line) = @_;
  my($OutFH);

  $OutFH = $OptionsInfo{FileOutput} ? $OptionsInfo{OutFileHandle} : \*STDOUT;
  print {$OutFH} "$Line\n";
}

# List data for an amino acid...
#
# Parameters:
#   $DataLabelRef - reference to an array of property labels
#   $DataValueRef - reference to an array of property values in the same order
#
# AminoAcidRows output style writes all the values on one delimited line; any
# other style writes one "label value" line for each label.
#
sub ListAminoAcidData {
  my($DataLabelRef, $DataValueRef) = @_;
  my($Index, $Line);

  if (!$OptionsInfo{AminoAcidRowsOutput}) {
    # Format and list data...
    for $Index (0 .. $#{$DataLabelRef} ) {
      _WriteOutputLine($DataLabelRef->[$Index] . ' ' . $DataValueRef->[$Index]);
    }
    return;
  }

  # Format data...
  if ($OptionsInfo{OutQuote} || $Options{outdelim} !~ /^comma$/i) {
    $Line = JoinWords($DataValueRef, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  }
  else {
    # Unquoted comma delimited output: always quote values containing commas...
    $Line = join $OptionsInfo{OutDelim}, map { /\,/ ? qq("$_") : $_ } @{$DataValueRef};
  }
  _WriteOutputLine($Line);
}

# List the header row containing property names for AminoAcidRows output...
#
# Parameters:
#   $DataLabelRef - reference to an array of property labels
#
sub ListHeaderRowData {
  my($DataLabelRef) = @_;
  my($Line);

  # Format data...
  $Line = JoinWords($DataLabelRef, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});

  # Labels are built as "Name:" by the caller; column headers don't use colons...
  $Line =~ s/\://g;

  # List data...
  _WriteOutputLine($Line);
}

# List properties for amino acids...
#
# Goes over the amino acid IDs and property names set up in %OptionsInfo,
# retrieves property data using AminoAcids::GetAminoAcidPropertiesData, and
# writes it out to STDOUT or to the output file named in %OptionsInfo. Dies
# when the output file can't be opened or closed.
#
sub ListAminoAcidProperties {
  my($AminoAcidID, $AminoAcidDataRef, $PropertyName, $PropertyValue, $OutFH, @PropertyLabels, @PropertyValues);

  print "Listing information for amino acid(s)...\n";

  if ($OptionsInfo{FileOutput}) {
    print "Generating file $OptionsInfo{OutFileName}...\n";
    # Three argument open keeps the file name from being interpreted as part of the mode...
    open $OutFH, '>', $OptionsInfo{OutFileName} or die "Couldn't open $OptionsInfo{OutFileName}: $!\n";
    $OptionsInfo{OutFileHandle} = $OutFH;
  }

  # Setup property labels...
  @PropertyLabels = map { "$_:" } @{$OptionsInfo{SpecifiedProperies}};

  if ($OptionsInfo{AminoAcidRowsOutput}) {
    ListHeaderRowData(\@PropertyLabels);
  }

  # Go over specified amino acids...
  for $AminoAcidID (@{$OptionsInfo{SpecifiedAminoAcidIDs}}) {
    $AminoAcidDataRef = AminoAcids::GetAminoAcidPropertiesData($AminoAcidID);

    if (!$OptionsInfo{AminoAcidRowsOutput}) {
      # The helper appends the final newline of the blank line after the title...
      _WriteOutputLine("\nListing properties for amino acid $AminoAcidID...\n");
    }

    # Collect data..
    @PropertyValues = ();
    for $PropertyName (@{$OptionsInfo{SpecifiedProperies}}) {
      $PropertyValue = $AminoAcidDataRef->{$PropertyName};
      if (IsFloat($PropertyValue)) {
        # Round to the requested precision; adding 0 drops trailing zeros...
        $PropertyValue = sprintf("%.$OptionsInfo{Precision}f", $PropertyValue) + 0;
      }
      push @PropertyValues, $PropertyValue;
    }
    # List data...
    ListAminoAcidData(\@PropertyLabels, \@PropertyValues);
  }
  if ($OptionsInfo{FileOutput}) {
    # Buffered write errors surface at close time...
    $OutFH = delete $OptionsInfo{OutFileHandle};
    close $OutFH or die "Couldn't close $OptionsInfo{OutFileName}: $!\n";
  }
  print "\n";
}

# Get property names from categories...
sub GetPropertyNamesFromCategories {
  # Parameters:
  #   $CategoryName - Basic, BasicPlus, BasicAndHydrophobicity, or
  #                   BasicAndHydrophobicityPlus; matched case insensitively
  #
  # Returns the ordered list of property names for the category, or an empty
  # list for any other category name.
  #
  my($CategoryName) = @_;
  my(@CommonNames, @BasicNames, @PropertyNames);

  # Every category starts with these names; Basic and both hydrophobicity
  # categories start with the complete Basic list...
  @CommonNames = ('ThreeLetterCode', 'OneLetterCode', 'AminoAcid', 'DNACodons', 'RNACodons');
  @BasicNames = (@CommonNames, 'ChemicalFormula', 'MolecularWeight', 'LinearStructure', 'LinearStructureAtpH7.4');

  @PropertyNames = ();
  if ($CategoryName =~ /^Basic$/i) {
    @PropertyNames = @BasicNames;
  }
  elsif ($CategoryName =~ /^BasicPlus$/i) {
    @PropertyNames = (
      @CommonNames,
      'AcidicBasic', 'PolarNonpolar', 'Charged', 'Aromatic', 'HydrophobicHydophilic',
      'IsoelectricPoint', 'pKCOOH', 'pKNH3+',
      'ChemicalFormula', 'MolecularWeight', 'ExactMass',
      'ChemicalFormulaMinusH2O', 'MolecularWeightMinusH2O(18.01524)', 'ExactMassMinusH2O(18.01056)',
      'LinearStructure', 'LinearStructureAtpH7.4',
    );
  }
  elsif ($CategoryName =~ /^BasicAndHydrophobicity$/i) {
    @PropertyNames = (
      @BasicNames,
      'HydrophobicityEisenbergAndOthers', 'HydrophobicityHoppAndWoods', 'HydrophobicityJanin',
      'HydrophobicityKyteAndDoolittle', 'HydrophobicityRoseAndOthers', 'HydrophobicityWolfendenAndOthers',
    );
  }
  elsif ($CategoryName =~ /^BasicAndHydrophobicityPlus$/i) {
    @PropertyNames = (
      @BasicNames,
      'HydrophobicityAbrahamAndLeo', 'HydrophobicityBlack', 'HydrophobicityBullAndBreese',
      'HydrophobicityChothia', 'HydrophobicityEisenbergAndOthers', 'HydrophobicityFauchereAndOthers',
      'HydrophobicityGuy', 'HydrophobicityHPLCAtpH3.4Cowan', 'HydrophobicityHPLCAtpH7.5Cowan',
      'HydrophobicityHPLCParkerAndOthers', 'HydrophobicityHPLCWilsonAndOthers', 'HydrophobicityHoppAndWoods',
      'HydrophobicityJanin', 'HydrophobicityKyteAndDoolittle', 'HydrophobicityManavalanAndOthers',
      'HydrophobicityMiyazawaAndOthers', 'HydrophobicityOMHSweetAndOthers', 'HydrophobicityRaoAndArgos',
      'HydrophobicityRfMobility', 'HydrophobicityRoseAndOthers', 'HydrophobicityRoseman',
      'HydrophobicityWellingAndOthers', 'HydrophobicityWolfendenAndOthers',
    );
  }

  return @PropertyNames;
}

# Process option values...
#
# Translates the validated values in %Options into %OptionsInfo: output
# delimiter and quoting, output style and destination, precision, the list of
# amino acid IDs taken from @ARGV, the list of properties, and the output file
# name. Dies when the output file already exists and overwrite is not set.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /^tab$/i)       ? "\t"
                         : ($Options{outdelim} =~ /^semicolon$/i) ? "\;"
                         :                                          "\,";
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;

  $OptionsInfo{Overwrite} = $Options{overwrite};
  $OptionsInfo{OutFileRoot} = $Options{root};

  $OptionsInfo{OutputStyle} = $Options{outputstyle};

  $OptionsInfo{AminoAcidRowsOutput} = ($Options{outputstyle} =~ /^AminoAcidRows$/i) ? 1 : 0;
  $OptionsInfo{FileOutput} = ($Options{output} =~ /^File$/i) ? 1 : 0;

  $OptionsInfo{Precision} = $Options{precision};

  my($AminoAcidID, @AminoAcidIDs);

  @{$OptionsInfo{SpecifiedAminoAcidIDs}} = ();

  # Set up Amino Acids IDs except for All mode; default to Ala...
  @AminoAcidIDs = @ARGV ? @ARGV : ('Ala');

  # Generate list of amino acids...
  if (@ARGV == 1 && $ARGV[0] =~ /^All$/i) {
    push @{$OptionsInfo{SpecifiedAminoAcidIDs}}, AminoAcids::GetAminoAcids();
  }
  else {
    for $AminoAcidID (@AminoAcidIDs) {
      if (AminoAcids::IsAminoAcid($AminoAcidID)) {
        push @{$OptionsInfo{SpecifiedAminoAcidIDs}}, $AminoAcidID;
      }
      else {
        warn "Ignoring amino acid ID, $AminoAcidID, specified using command line parameter option: Unknown amino acid ID...\n";
      }
    }
  }
  SetupSpecifiedProperties();

  # Setup output file name...
  $OptionsInfo{OutFileName} = '';
  if ($OptionsInfo{FileOutput}) {
    my($OutFileRoot, $OutFileExt);

    $OutFileExt = ($Options{outdelim} =~ /^tab$/i) ? "tsv" : "csv";

    $OutFileRoot = 'AminoAcidsInfo';
    if ($Options{root}) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
      # Use just the name part when the specified root carries an extension...
      $OutFileRoot = ($RootFileName && $RootFileExt) ? $RootFileName : $Options{root};
    }
    $OptionsInfo{OutFileName} = $OutFileRoot . '.' . $OutFileExt;

    if (!$Options{overwrite} && -e $OptionsInfo{OutFileName}) {
      die "Error: Output file, $OptionsInfo{OutFileName}, already exists.\nUse \"-o --overwrite\" option or specify a different name using \"-r --root\" option.\n";
    }
  }
}

# Setup properties to list...
#
# Fills @{$OptionsInfo{SpecifiedProperies}} from the properties,
# propertiesmode, and propertieslisting options:
#
#   . All mode: all property names from AminoAcids::GetAminoAcidPropertiesNames
#   . No properties specified: property names for the Basic category
#   . Categories mode: property names for the specified category; dies for an
#     invalid category name
#   . Names mode: the specified names that are valid property names; unknown
#     names are ignored with a warning
#
sub SetupSpecifiedProperties {
  my($AlphabeticalListing);

  $OptionsInfo{Properties} = $Options{properties};

  $OptionsInfo{PropertiesMode} = $Options{propertiesmode};
  $OptionsInfo{PropertiesListing} = $Options{propertieslisting};

  $AlphabeticalListing = ($Options{propertieslisting} =~ /^Alphabetical$/i) ? 1 : 0;

  # Make sure appropriate properties/category names are specified...
  @{$OptionsInfo{SpecifiedProperies}} = ();

  if ($Options{propertiesmode} =~ /^All$/i) {
    if ($Options{properties}) {
      warn "Warning: Ignoring values specifed by \"-p --properties\" option: Not valid for All value of \"--propertiesmode\" option...\n";
    }
    if ($AlphabeticalListing) {
      push @{$OptionsInfo{SpecifiedProperies}}, AminoAcids::GetAminoAcidPropertiesNames('Alphabetical');
    }
    else {
      push @{$OptionsInfo{SpecifiedProperies}}, AminoAcids::GetAminoAcidPropertiesNames();
    }
    return;
  }

  if (!$Options{properties}) {
    # Set default value...
    push @{$OptionsInfo{SpecifiedProperies}}, GetPropertyNamesFromCategories('Basic');
    return;
  }

  if ($Options{propertiesmode} =~ /^Categories$/i) {
    # Check category name...
    if ($Options{properties} !~ /^(Basic|BasicPlus|BasicAndHydrophobicity|BasicAndHydrophobicityPlus)$/i) {
      die "Error: The value specified, $Options{properties}, for option \"-p --properties\" in conjunction with \"Categories\" value for option \"--propertiesmode\" is not valid. Allowed values: Basic, BasicPlus, BasicAndHydrophobicity, and BasicAndHydrophobicityPlus\n";
    }
    # Set propertynames...
    push @{$OptionsInfo{SpecifiedProperies}}, GetPropertyNamesFromCategories($Options{properties});
    return;
  }

  # Check property names..
  my($Name, $PropertyName);
  for $Name (split /\,/, $Options{properties}) {
    $PropertyName = RemoveLeadingAndTrailingWhiteSpaces($Name);
    if (AminoAcids::IsAminoAcidProperty($PropertyName)) {
      push @{$OptionsInfo{SpecifiedProperies}}, $PropertyName;
    }
    else {
      warn "Warning: Ignoring value, $Name, specifed by \"-p --properties\" option: Unknown property name...\n";
    }
  }

  if ($AlphabeticalListing) {
    # ThreeLetterCode, OneLetterCode and AminoAcid are always listed first, in
    # that order; the remaining names follow sorted and without duplicates...
    my($LeadingName, @LeadingNames, %LeadingNamePresent, %PropertiesMap);

    @LeadingNames = ('ThreeLetterCode', 'OneLetterCode', 'AminoAcid');
    %LeadingNamePresent = ();
    %PropertiesMap = ();

    NAME: for $Name (@{$OptionsInfo{SpecifiedProperies}}) {
      for $LeadingName (@LeadingNames) {
        if ($Name =~ /^\Q$LeadingName\E$/i) {
          $LeadingNamePresent{$LeadingName} = 1;
          next NAME;
        }
      }
      $PropertiesMap{$Name} = $Name;
    }

    # Setup the alphabetical list...
    @{$OptionsInfo{SpecifiedProperies}} = ((grep { $LeadingNamePresent{$_} } @LeadingNames), sort keys %PropertiesMap);
  }
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Sets default option values in %Options, parses the command line using
# GetOptions, changes to the working directory when one is specified, and dies
# with an error message for any invalid option value.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();
  $Options{outdelim} = "comma";
  $Options{output} = "STDOUT";
  $Options{outputstyle} = "AminoAcidBlock";
  $Options{precision} = 4;
  $Options{propertiesmode} = "Categories";
  $Options{propertieslisting} = "ByGroup";
  $Options{quote} = "yes";

  if (!GetOptions(\%Options, "help|h", "outdelim=s", "output=s", "outputstyle=s", "overwrite|o", "precision=i", "properties|p=s", "propertieslisting=s", "propertiesmode=s", "quote|q=s", "root|r=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{output} !~ /^(STDOUT|File)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: STDOUT or File\n";
  }
  if ($Options{outputstyle} !~ /^(AminoAcidBlock|AminoAcidRows)$/i) {
    die "Error: The value specified, $Options{outputstyle}, for option \"--outputstyle\" is not valid. Allowed values: AminoAcidBlock or AminoAcidRows\n";
  }
  if (!IsPositiveInteger($Options{precision})) {
    # No short form here: "-p" is the alias of the properties option...
    die "Error: The value specified, $Options{precision}, for option \"--precision\" is not valid. Allowed values: > 0 \n";
  }
  if ($Options{propertiesmode} !~ /^(Categories|Names|All)$/i) {
    die "Error: The value specified, $Options{propertiesmode}, for option \"--propertiesmode\" is not valid. Allowed values: Categories, Names, or All\n";
  }
  if ($Options{propertieslisting} !~ /^(ByGroup|Alphabetical)$/i) {
    die "Error: The value specified, $Options{propertieslisting}, for option \"--propertieslisting\" is not valid. Allowed values: ByGroup, or Alphabetical\n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
}