MayaChemTools

   1 package NucleicAcids;
   2 #
   3 # File: NucleicAcids.pm
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
  16 # any warranty; without even the implied warranty of merchantability of fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use Carp;
  28 use Text::ParseWords;
  29 use TextUtil;
  30 use FileUtil;
  31 
  32 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  33 
  34 @ISA = qw(Exporter);
  35 @EXPORT = qw();
  36 @EXPORT_OK = qw(GetNucleicAcids GetNucleicAcidsByType GetNucleicAcidPropertiesData GetNucleicAcidPropertiesNames IsNucleicAcid IsNucleicAcidProperty IsNucleicAcidType);
  37 
  38 %EXPORT_TAGS = (all  => [@EXPORT, @EXPORT_OK]);
  39 
  40 #
  41 # Load nucleic acids data...
  42 #
  43 my(%NucleicAcidDataMap, %NucleicAcidCodeMap, %NucleicAcidOtherCodeMap, %NucleicAcidNameMap, @NucleicAcidCodes, @NucleicAcidPropertyNames, %NucleicAcidPropertyNamesMap, %NucleicAcidTypesMap);
  44 _LoadNucleicAcidsData();
  45 
  46 #
  47 # Get a list of all known nucleic acids as one of these values:
  48 # code or nucleic acid name...
  49 #
  50 sub GetNucleicAcids {
  51   my($NameType, $Code, $Name, @NucleicAcidNames);
  52 
  53   $NameType = 'Code';
  54   if (@_ >= 1) {
  55     ($NameType) = @_;
  56   }
  57 
  58   # Collect names...
  59   @NucleicAcidNames = ();
  60   for $Code (@NucleicAcidCodes) {
  61     NAME : {
  62       if ($NameType =~ /^Name$/i) {$Name = $NucleicAcidDataMap{$Code}{Name}; last NAME; }
  63       $Name = $Code;
  64     }
  65     push @NucleicAcidNames, $Name;
  66   }
  67 
  68   return (wantarray ? @NucleicAcidNames : \@NucleicAcidNames);
  69 }
  70 
  71 #
  72 # Get a list of all known nucleic acids by one of these specified types:
  73 # Nucleobase, Nucleoside, Deoxynucleoside, Nucleotide, Deoxynucleotide. Default: Nucleoside
  74 #
  75 sub GetNucleicAcidsByType {
  76   my($NameType, $Type, $Code, $Name, @NucleicAcidNames);
  77 
  78   $Type = 'Nucleoside';
  79   $NameType = 'Code';
  80   if (@_ == 2) {
  81     ($Type, $NameType) = @_;
  82   }
  83   elsif (@_ == 1) {
  84     ($Type) = @_;
  85   }
  86 
  87   # Collect names...
  88   @NucleicAcidNames = ();
  89   CODE: for $Code (@NucleicAcidCodes) {
  90     if ($NucleicAcidDataMap{$Code}{Type} !~ /^$Type$/i ) {
  91       next CODE;
  92     }
  93     NAME : {
  94       if ($NameType =~ /^Name$/i) {$Name = $NucleicAcidDataMap{$Code}{Name}; last NAME; }
  95       $Name = $Code;
  96     }
  97     push @NucleicAcidNames, $Name;
  98   }
  99 
 100   return (wantarray ? @NucleicAcidNames : \@NucleicAcidNames);
 101 }
 102 
 103 #
 104 # Get all available properties data for an nucleic acid using any of these symbols:
 105 # code, other code or name.
 106 #
 107 # A reference to a hash array is returned with keys and values representing property
 108 # name and its values respectively.
 109 #
 110 sub GetNucleicAcidPropertiesData {
 111   my($NucleicAcidID) = @_;
 112   my($Code);
 113 
 114   if ($Code = _ValidateNucleicAcidID($NucleicAcidID)) {
 115     return \%{$NucleicAcidDataMap{$Code}};
 116   }
 117   else {
 118     return undef;
 119   }
 120 }
 121 
 122 #
 123 # Get names of all available nucleic acid properties. A reference to  an array containing
 124 # names of all available properties is returned.
 125 #
 126 sub GetNucleicAcidPropertiesNames {
 127   my($Mode);
 128   my($PropertyName, @PropertyNames);
 129 
 130   $Mode = 'ByGroup';
 131   if (@_ == 1) {
 132     ($Mode) = @_;
 133   }
 134 
 135   @PropertyNames = ();
 136   if ($Mode =~ /^Alphabetical$/i) {
 137     my($PropertyName);
 138     # Code, OtherCodes and Name are always listed first...
 139     push @PropertyNames, qw(Code OtherCodes Name);
 140     for $PropertyName (sort keys %NucleicAcidPropertyNamesMap) {
 141       if ($PropertyName !~ /^(Code|OtherCodes|Name)$/) {
 142         push @PropertyNames, $PropertyName;
 143       }
 144     }
 145   }
 146   else {
 147     push @PropertyNames, @NucleicAcidPropertyNames;
 148   }
 149   return (wantarray ? @PropertyNames : \@PropertyNames);
 150 }
 151 
 152 #
 153 # Is it a known nucleic acid? Input is either a code or a name
 154 #
 155 sub IsNucleicAcid {
 156   my($NucleicAcidID) = @_;
 157   my($Status);
 158 
 159   $Status = (_ValidateNucleicAcidID($NucleicAcidID)) ? 1 : 0;
 160 
 161   return $Status;
 162 }
 163 
 164 #
 165 # Is it an available nucleic acid property?
 166 #
 167 sub IsNucleicAcidProperty {
 168   my($PropertyName) = @_;
 169   my($Status);
 170 
 171   $Status = (exists($NucleicAcidPropertyNamesMap{$PropertyName})) ? 1 : 0;
 172 
 173   return $Status;
 174 }
 175 
 176 #
 177 # Is it an available nucleic acid type?
 178 #
 179 sub IsNucleicAcidType {
 180   my($Type) = @_;
 181   my($Status);
 182 
 183   $Status = (exists($NucleicAcidTypesMap{lc($Type)})) ? 1 : 0;
 184 
 185   return $Status;
 186 }
 187 
 188 #
 189 # Implents GetNucleicAcid<PropertyName> for a valid proprty name.
 190 #
 191 sub AUTOLOAD {
 192   my($NucleicAcidID) = @_;
 193   my($FunctionName, $PropertyName, $PropertyValue, $Code);
 194 
 195   $PropertyValue = undef;
 196 
 197   use vars qw($AUTOLOAD);
 198   $FunctionName = $AUTOLOAD;
 199   $FunctionName =~ s/.*:://;
 200 
 201   # Only Get<PropertyName> functions are supported...
 202   if ($FunctionName !~ /^Get/) {
 203     croak "Error: Function, NucleicAcid::$FunctionName, is not supported by AUTOLOAD in NucleicAcid module: Only Get<PropertyName> functions are implemented...";
 204   }
 205 
 206   $PropertyName = $FunctionName;
 207   $PropertyName =~  s/^GetNucleicAcid//;
 208   if (!exists $NucleicAcidPropertyNamesMap{$PropertyName}) {
 209     croak "Error: Function, NucleicAcid::$FunctionName, is not supported by AUTOLOAD in NucleicAcid module: Unknown nucleic acid property name, $PropertyName, specified...";
 210   }
 211 
 212   if (!($Code = _ValidateNucleicAcidID($NucleicAcidID))) {
 213     return undef;
 214   }
 215   $PropertyValue = $NucleicAcidDataMap{$Code}{$PropertyName};
 216   return $PropertyValue;
 217 }
 218 
 219 #
 220 # Load NucleicAcidsData.csv files from <MayaChemTools>/lib directory...
 221 #
 222 sub _LoadNucleicAcidsData {
 223   my($NucleicAcidsDataFile, $MayaChemToolsLibDir);
 224 
 225   $MayaChemToolsLibDir = GetMayaChemToolsLibDirName();
 226 
 227   $NucleicAcidsDataFile =  "$MayaChemToolsLibDir" . "/data/NucleicAcidsData.csv";
 228 
 229   if (! -e "$NucleicAcidsDataFile") {
 230     croak "Error: MayaChemTools package file, $NucleicAcidsDataFile, is missing: Possible installation problems...";
 231   }
 232 
 233   _LoadData($NucleicAcidsDataFile);
 234 }
 235 
 236 #
 237 # Load NucleicAcidsData.csv file from <MayaChemTools>/lib directory...
 238 #
 239 sub _LoadData {
 240   my($NucleicAcidsDataFile) = @_;
 241 
 242   %NucleicAcidDataMap = ();
 243   @NucleicAcidCodes = ();
 244   @NucleicAcidPropertyNames = ();
 245   %NucleicAcidPropertyNamesMap = ();
 246   %NucleicAcidCodeMap = ();
 247   %NucleicAcidOtherCodeMap = ();
 248   %NucleicAcidNameMap = ();
 249   %NucleicAcidTypesMap = ();
 250 
 251   # Load property data for all nucleic acids...
 252   #
 253   # File Format:
 254   # "Code","OtherCodes","BasePair","Name","Type","ChemicalFormula","ChemicalFormulaAtpH7.5","MolecularWeight","ExactMass","ElementalComposition"
 255   #
 256   my($Code, $OtherCodes, $NucleicAcidName, $Line, $NumOfCols, $InDelim, $Index, $Name, $Value, $Units, @LineWords, @ColLabels);
 257 
 258   $InDelim = "\,";
 259   open NUCLEICACIDSDATAFILE, "$NucleicAcidsDataFile" or croak "Couldn't open $NucleicAcidsDataFile: $! ...";
 260 
 261   # Skip lines up to column labels...
 262   LINE: while ($Line = GetTextLine(\*NUCLEICACIDSDATAFILE)) {
 263     if ($Line !~ /^#/) {
 264       last LINE;
 265     }
 266   }
 267   @ColLabels= quotewords($InDelim, 0, $Line);
 268   $NumOfCols = @ColLabels;
 269 
 270   # Extract property names from column labels...
 271   @NucleicAcidPropertyNames = ();
 272   for $Index (0 .. $#ColLabels) {
 273     $Name = $ColLabels[$Index];
 274     push @NucleicAcidPropertyNames, $Name;
 275 
 276     # Store property names...
 277     $NucleicAcidPropertyNamesMap{$Name} = $Name;
 278   }
 279 
 280   # Process nucleic acid data...
 281   LINE: while ($Line = GetTextLine(\*NUCLEICACIDSDATAFILE)) {
 282     if ($Line =~ /^#/) {
 283       next LINE;
 284     }
 285     @LineWords = ();
 286     @LineWords = quotewords($InDelim, 0, $Line);
 287     if (@LineWords != $NumOfCols) {
 288       croak "Error: The number of data fields, @LineWords, in $NucleicAcidsDataFile must be $NumOfCols.\nLine: $Line...";
 289     }
 290     $Code = $LineWords[0]; $OtherCodes = $LineWords[1]; $NucleicAcidName = $LineWords[3];
 291     if (exists $NucleicAcidDataMap{$Code}) {
 292       carp "Warning: Ignoring data for nucleic acid $Code: It has already been loaded.\nLine: $Line....";
 293       next LINE;
 294     }
 295 
 296     # Store all the values...
 297     push @NucleicAcidCodes, $Code;
 298     %{$NucleicAcidDataMap{$Code}} = ();
 299     for $Index (0 .. $#LineWords) {
 300       $Name = $NucleicAcidPropertyNames[$Index];
 301       $Value = $LineWords[$Index];
 302       $NucleicAcidDataMap{$Code}{$Name} = $Value;
 303     }
 304   }
 305   close NUCLEICACIDSDATAFILE;
 306 
 307   # Setup one letter and nucleic acid name maps...
 308   _SetupNucleicAcidIDMap();
 309 }
 310 
 311 #
 312 # Setup lowercase other codes and name maps pointing
 313 # to code as show in data file.
 314 #
 315 sub _SetupNucleicAcidIDMap {
 316   my($Code, @OtherCodes, $OtherCode, $NucleicAcidName, $NucleicAcidType);
 317 
 318   %NucleicAcidCodeMap = ();
 319   %NucleicAcidOtherCodeMap = ();
 320   %NucleicAcidNameMap = ();
 321   %NucleicAcidTypesMap = ();
 322 
 323   for $Code (keys %NucleicAcidDataMap) {
 324     $NucleicAcidCodeMap{lc($Code)} = $Code;
 325 
 326     $NucleicAcidName = $NucleicAcidDataMap{$Code}{Name};
 327     $NucleicAcidNameMap{lc($NucleicAcidName)} = $Code;
 328 
 329     $NucleicAcidType = $NucleicAcidDataMap{$Code}{Type};
 330     if (! exists $NucleicAcidTypesMap{$NucleicAcidType}) {
 331       $NucleicAcidTypesMap{lc($NucleicAcidType)} = $NucleicAcidType;
 332     }
 333 
 334     @OtherCodes = split /\,/, $NucleicAcidDataMap{$Code}{OtherCodes};
 335     OTHERCODE: for $OtherCode (@OtherCodes) {
 336       if (!$OtherCode) {
 337         next OTHERCODE;
 338       }
 339       $OtherCode = RemoveLeadingAndTrailingWhiteSpaces($OtherCode);
 340       $NucleicAcidOtherCodeMap{lc($OtherCode)} = $Code;
 341     }
 342   }
 343 }
 344 
 345 # Validate Nucleic acid ID...
 346 sub _ValidateNucleicAcidID {
 347   my($NucleicAcidID) = @_;
 348   my($Code) = undef;
 349 
 350   if (exists $NucleicAcidCodeMap{lc($NucleicAcidID)}) {
 351     $Code = $NucleicAcidCodeMap{lc($NucleicAcidID)};
 352   }
 353   elsif (exists $NucleicAcidOtherCodeMap{lc($NucleicAcidID)}) {
 354     $Code = $NucleicAcidOtherCodeMap{lc($NucleicAcidID)};
 355   }
 356   elsif (exists $NucleicAcidNameMap{lc($NucleicAcidID)}) {
 357     $Code = $NucleicAcidNameMap{lc($NucleicAcidID)};
 358   }
 359   return $Code;
 360 }
 361 
 362