package NucleicAcids;
#
# File: NucleicAcids.pm
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2024 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use Carp;
use Exporter ();
use Text::ParseWords;
use TextUtil;
use FileUtil;

use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $AUTOLOAD);

@ISA = qw(Exporter);
@EXPORT = qw();
@EXPORT_OK = qw(GetNucleicAcids GetNucleicAcidsByType GetNucleicAcidPropertiesData GetNucleicAcidPropertiesNames IsNucleicAcid IsNucleicAcidProperty IsNucleicAcidType);

%EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]);

#
# Module data, filled in once when the module is loaded:
#
#   %NucleicAcidDataMap          - code => { property name => value }
#   @NucleicAcidCodes            - codes in the order they appear in the data file
#   @NucleicAcidPropertyNames    - property (column) names in data file order
#   %NucleicAcidPropertyNamesMap - property name => property name
#   %NucleicAcidCodeMap          - lowercase code => code
#   %NucleicAcidOtherCodeMap     - lowercase other code => code
#   %NucleicAcidNameMap          - lowercase name => code
#   %NucleicAcidTypesMap         - lowercase type => type
#
my(%NucleicAcidDataMap, %NucleicAcidCodeMap, %NucleicAcidOtherCodeMap, %NucleicAcidNameMap, @NucleicAcidCodes, @NucleicAcidPropertyNames, %NucleicAcidPropertyNamesMap, %NucleicAcidTypesMap);
_LoadNucleicAcidsData();

#
# Get a list of all known nucleic acids as one of these values:
# code or nucleic acid name.
#
# Arguments:
#   NameType - optional; 'Name' (any case) selects names, anything else
#              selects codes. Default: Code
#
# Returns the list in list context and a reference to it in scalar context.
#
sub GetNucleicAcids {
  my $NameType = @_ ? $_[0] : 'Code';

  my @NucleicAcidNames = _NamesForCodes($NameType, @NucleicAcidCodes);

  return (wantarray ? @NucleicAcidNames : \@NucleicAcidNames);
}

#
# Get a list of all known nucleic acids by one of these specified types:
# Nucleobase, Nucleoside, Deoxynucleoside, Nucleotide, Deoxynucleotide.
#
# Arguments:
#   Type     - optional; compared case insensitively. Default: Nucleoside
#   NameType - optional; 'Name' (any case) selects names, anything else
#              selects codes. Default: Code
#
# The arguments are used only when exactly one or two of them are passed;
# any other argument count falls back to the defaults.
#
# Returns the list in list context and a reference to it in scalar context.
#
sub GetNucleicAcidsByType {
  my($Type, $NameType) = ('Nucleoside', 'Code');

  if (@_ == 2) {
    ($Type, $NameType) = @_;
  }
  elsif (@_ == 1) {
    ($Type) = @_;
  }
  $Type = '' unless defined $Type;

  # The type is caller supplied text, not a pattern: quote it so that any
  # regex metacharacters in it are matched literally...
  my @MatchingCodes = grep { $NucleicAcidDataMap{$_}{Type} =~ /^\Q$Type\E$/i } @NucleicAcidCodes;

  my @NucleicAcidNames = _NamesForCodes($NameType, @MatchingCodes);

  return (wantarray ? @NucleicAcidNames : \@NucleicAcidNames);
}

#
# Get all available properties data for a nucleic acid using any of these
# symbols: code, other code or name.
#
# A reference to the hash holding property names and values is returned. It is
# the hash stored by this module and not a copy of it. An unknown ID yields
# undef.
#
sub GetNucleicAcidPropertiesData {
  my($NucleicAcidID) = @_;

  my $Code = _ValidateNucleicAcidID($NucleicAcidID);
  if ($Code) {
    return $NucleicAcidDataMap{$Code};
  }

  # An explicit undef, instead of an empty list, is retained for callers
  # using this function in list context...
  return undef;
}

#
# Get names of all available nucleic acid properties.
#
# Arguments:
#   Mode - optional; 'Alphabetical' (any case) sorts the names, with Code,
#          OtherCodes and Name always listed first. Any other value returns
#          the names in data file order. Default: ByGroup
#
# The argument is used only when exactly one argument is passed.
#
# Returns the list in list context and a reference to it in scalar context.
#
sub GetNucleicAcidPropertiesNames {
  my $Mode = (@_ == 1) ? $_[0] : 'ByGroup';
  my(@PropertyNames);

  if (defined $Mode && $Mode =~ /^Alphabetical$/i) {
    # Code, OtherCodes and Name are always listed first...
    @PropertyNames = (
      qw(Code OtherCodes Name),
      grep { !/^(?:Code|OtherCodes|Name)$/ } sort keys %NucleicAcidPropertyNamesMap
    );
  }
  else {
    @PropertyNames = @NucleicAcidPropertyNames;
  }

  return (wantarray ? @PropertyNames : \@PropertyNames);
}

#
# Is it a known nucleic acid? Input is a code, other code or name.
#
# Returns 1 or 0.
#
sub IsNucleicAcid {
  my($NucleicAcidID) = @_;

  return _ValidateNucleicAcidID($NucleicAcidID) ? 1 : 0;
}

#
# Is it an available nucleic acid property? The name is case sensitive.
#
# Returns 1 or 0.
#
sub IsNucleicAcidProperty {
  my($PropertyName) = @_;

  return exists($NucleicAcidPropertyNamesMap{$PropertyName}) ? 1 : 0;
}

#
# Is it an available nucleic acid type? The type is compared case
# insensitively.
#
# Returns 1 or 0.
#
sub IsNucleicAcidType {
  my($Type) = @_;

  return exists($NucleicAcidTypesMap{lc($Type)}) ? 1 : 0;
}

#
# Implements GetNucleicAcid<PropertyName> for a valid property name.
#
# The nucleic acid is identified by code, other code or name. The property
# value is returned; an unknown nucleic acid ID yields undef. It croaks for a
# function name not starting with Get or for an unknown property name.
#
sub AUTOLOAD {
  my($NucleicAcidID) = @_;

  my $FunctionName = $AUTOLOAD;
  $FunctionName =~ s/.*:://;

  # Never treat object destruction as a property request...
  return if $FunctionName eq 'DESTROY';

  # Only Get<PropertyName> functions are supported...
  if ($FunctionName !~ /^Get/) {
    croak "Error: Function, NucleicAcids::$FunctionName, is not supported by AUTOLOAD in NucleicAcids module: Only Get<PropertyName> functions are implemented...";
  }

  my $PropertyName = $FunctionName;
  $PropertyName =~ s/^GetNucleicAcid//;
  if (!exists $NucleicAcidPropertyNamesMap{$PropertyName}) {
    croak "Error: Function, NucleicAcids::$FunctionName, is not supported by AUTOLOAD in NucleicAcids module: Unknown nucleic acid property name, $PropertyName, specified...";
  }

  my $Code = _ValidateNucleicAcidID($NucleicAcidID);
  if (!$Code) {
    return undef;
  }

  return $NucleicAcidDataMap{$Code}{$PropertyName};
}

#
# Map nucleic acid codes to the requested kind of name: 'Name' (any case)
# selects the Name property and anything else leaves the codes as they are.
#
sub _NamesForCodes {
  my($NameType, @Codes) = @_;

  if (defined $NameType && $NameType =~ /^Name$/i) {
    return map { $NucleicAcidDataMap{$_}{Name} } @Codes;
  }

  return @Codes;
}

#
# Locate data/NucleicAcidsData.csv under the MayaChemTools lib directory and
# load it. It croaks when the file is missing.
#
sub _LoadNucleicAcidsData {
  my $MayaChemToolsLibDir = GetMayaChemToolsLibDirName();

  my $NucleicAcidsDataFile = "$MayaChemToolsLibDir/data/NucleicAcidsData.csv";

  if (! -e $NucleicAcidsDataFile) {
    croak "Error: MayaChemTools package file, $NucleicAcidsDataFile, is missing: Possible installation problems...";
  }

  _LoadData($NucleicAcidsDataFile);
}

#
# Load the specified nucleic acids data file into the module data maps, after
# discarding any data loaded earlier.
#
# File Format:
# "Code","OtherCodes","BasePair","Name","Type","ChemicalFormula","ChemicalFormulaAtpH7.5","MolecularWeight","ExactMass","ElementalComposition"
#
# Lines starting with # are comments. The first other line supplies the column
# labels, which become the property names. It croaks when the file can't be
# opened or when a data line doesn't contain one field for each column label.
#
sub _LoadData {
  my($NucleicAcidsDataFile) = @_;

  %NucleicAcidDataMap = ();
  @NucleicAcidCodes = ();
  @NucleicAcidPropertyNames = ();
  %NucleicAcidPropertyNamesMap = ();
  %NucleicAcidCodeMap = ();
  %NucleicAcidOtherCodeMap = ();
  %NucleicAcidNameMap = ();
  %NucleicAcidTypesMap = ();

  my $InDelim = "\,";

  # Three argument open keeps the file name from being parsed for a mode; the
  # lexical handle is closed automatically should croak get called below...
  open my $DataFileHandle, '<', $NucleicAcidsDataFile or croak "Couldn't open $NucleicAcidsDataFile: $! ...";

  # Skip lines up to column labels...
  my($Line);
  LINE: while ($Line = GetTextLine($DataFileHandle)) {
    if ($Line !~ /^#/) {
      last LINE;
    }
  }
  my @ColLabels = quotewords($InDelim, 0, $Line);
  my $NumOfCols = @ColLabels;

  # Column labels are the property names...
  @NucleicAcidPropertyNames = @ColLabels;
  @NucleicAcidPropertyNamesMap{@ColLabels} = @ColLabels;

  # Process nucleic acid data...
  LINE: while ($Line = GetTextLine($DataFileHandle)) {
    if ($Line =~ /^#/) {
      next LINE;
    }
    my @LineWords = quotewords($InDelim, 0, $Line);
    if (@LineWords != $NumOfCols) {
      # Report how many fields were found instead of the fields themselves...
      my $NumOfFields = @LineWords;
      croak "Error: The number of data fields, $NumOfFields, in $NucleicAcidsDataFile must be $NumOfCols.\nLine: $Line...";
    }

    my $Code = $LineWords[0];
    if (exists $NucleicAcidDataMap{$Code}) {
      carp "Warning: Ignoring data for nucleic acid $Code: It has already been loaded.\nLine: $Line....";
      next LINE;
    }

    # Store all the values; field count is known to match the column count...
    push @NucleicAcidCodes, $Code;

    my(%PropertyData);
    @PropertyData{@NucleicAcidPropertyNames} = @LineWords;
    $NucleicAcidDataMap{$Code} = \%PropertyData;
  }
  close $DataFileHandle;

  # Setup code, other code, name and type maps...
  _SetupNucleicAcidIDMap();
}

#
# Setup lowercase code, other codes, name and type maps pointing to code, or
# type, as shown in data file.
#
# Codes are processed in data file order: for a name or other code shared by
# multiple nucleic acids, the last one in the data file is used.
#
sub _SetupNucleicAcidIDMap {
  %NucleicAcidCodeMap = ();
  %NucleicAcidOtherCodeMap = ();
  %NucleicAcidNameMap = ();
  %NucleicAcidTypesMap = ();

  for my $Code (@NucleicAcidCodes) {
    my $PropertyData = $NucleicAcidDataMap{$Code};

    $NucleicAcidCodeMap{lc($Code)} = $Code;
    $NucleicAcidNameMap{lc($PropertyData->{Name})} = $Code;

    # Check and store using the same lowercase key; first spelling of a type
    # found in the data file is kept...
    my $NucleicAcidType = $PropertyData->{Type};
    my $TypeKey = lc($NucleicAcidType);
    if (!exists $NucleicAcidTypesMap{$TypeKey}) {
      $NucleicAcidTypesMap{$TypeKey} = $NucleicAcidType;
    }

    OTHERCODE: for my $OtherCode (split /\,/, $PropertyData->{OtherCodes}) {
      if (!$OtherCode) {
        next OTHERCODE;
      }
      $OtherCode = RemoveLeadingAndTrailingWhiteSpaces($OtherCode);

      # Nothing to map for a code containing only white space...
      if (!(defined($OtherCode) && length($OtherCode))) {
        next OTHERCODE;
      }
      $NucleicAcidOtherCodeMap{lc($OtherCode)} = $Code;
    }
  }
}

#
# Validate nucleic acid ID: It is looked up case insensitively as a code, then
# as an other code and finally as a name.
#
# Returns the code as shown in data file, or undef for an unknown or undefined
# ID.
#
sub _ValidateNucleicAcidID {
  my($NucleicAcidID) = @_;

  return unless defined $NucleicAcidID;

  my $IDKey = lc($NucleicAcidID);
  for my $IDMap (\%NucleicAcidCodeMap, \%NucleicAcidOtherCodeMap, \%NucleicAcidNameMap) {
    if (exists $IDMap->{$IDKey}) {
      return $IDMap->{$IDKey};
    }
  }

  return;
}
