1 package Fingerprints::FingerprintsFileUtil; 2 # 3 # File: FingerprintsFileUtil.pm 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # This file is part of MayaChemTools. 9 # 10 # MayaChemTools is free software; you can redistribute it and/or modify it under 11 # the terms of the GNU Lesser General Public License as published by the Free 12 # Software Foundation; either version 3 of the License, or (at your option) any 13 # later version. 14 # 15 # MayaChemTools is distributed in the hope that it will be useful, but without 16 # any warranty; without even the implied warranty of merchantability of fitness 17 # for a particular purpose. See the GNU Lesser General Public License for more 18 # details. 19 # 20 # You should have received a copy of the GNU Lesser General Public License 21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 23 # Boston, MA, 02111-1307, USA. 24 # 25 26 use strict; 27 use Exporter; 28 use Carp; 29 use TextUtil (); 30 use FileUtil (); 31 use FileIO::FingerprintsSDFileIO; 32 use FileIO::FingerprintsTextFileIO; 33 use FileIO::FingerprintsFPFileIO; 34 35 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); 36 37 @ISA = qw(Exporter); 38 @EXPORT = qw(); 39 @EXPORT_OK = qw(GetFingerprintsFileType ReadAndProcessFingerpritsData NewFingerprintsFileIO); 40 41 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); 42 43 # Generate new FingerprintsFileIO object for a SD, FP or Text fingerprints file specified using file name 44 # along other appropriate parameters... 
#
# Parameters are passed as a hash of name and value pairs:
#
#   . Name - fingerprints file name. Required and must be non-empty; it is
#     also used to work out the fingerprints file type.
#   . Mode - file mode. Required and must be non-empty; it is not interpreted
#     here.
#   . All pairs, including Name and Mode, are handed unchanged to the
#     constructor of the file type specific FileIO class.
#
# Returns the object created by FileIO::FingerprintsSDFileIO,
# FileIO::FingerprintsFPFileIO or FileIO::FingerprintsTextFileIO. On any
# failure a warning is issued using carp and undef is returned.
#
sub NewFingerprintsFileIO {
  my(%FingerprintsFileIOParams) = @_;
  my($FileType);

  # Explicit undef is returned on failure, instead of a bare return, to keep
  # the return value shape unchanged for existing callers...
  if (!(exists($FingerprintsFileIOParams{Name}) && TextUtil::IsNotEmpty($FingerprintsFileIOParams{Name}))) {
    carp "Warning: Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO: Can't create new FingerprintsFileIO object: File name is not specified...\n";
    return undef;
  }

  if (!(exists($FingerprintsFileIOParams{Mode}) && TextUtil::IsNotEmpty($FingerprintsFileIOParams{Mode}))) {
    carp "Warning: Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO: Can't create new FingerprintsFileIO object: File mode is not specified...\n";
    return undef;
  }

  # An empty file type means the type couldn't be determined from the file name...
  $FileType = GetFingerprintsFileType($FingerprintsFileIOParams{Name});
  if (TextUtil::IsEmpty($FileType)) {
    carp "Warning: Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO: Can't create new FingerprintsFileIO object: File type is not specified...\n";
    return undef;
  }

  # Generate fingerprints IO object using Class->new(...) method call syntax; the
  # indirect object form, new Class(...), is ambiguous to parse and is avoided...
  if ($FileType =~ /^SD$/i) {
    return FileIO::FingerprintsSDFileIO->new(%FingerprintsFileIOParams);
  }
  elsif ($FileType =~ /^FP$/i) {
    return FileIO::FingerprintsFPFileIO->new(%FingerprintsFileIOParams);
  }
  elsif ($FileType =~ /^Text$/i) {
    return FileIO::FingerprintsTextFileIO->new(%FingerprintsFileIOParams);
  }

  # Defensive fallback for a file type which is not one of the three handled above...
  carp "Warning: Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO: Fingerprints file type, $FileType, is not valid. Supported file types: SD, FP or Text\n";

  return undef;
}

# Get fingerprints file type from fingerprints file name...
#
# The file name and a space delimited list of names are handed to
# FileUtil::CheckFileType (presumably a list of file extensions - confirm
# against FileUtil). The lists are tried in the order shown below and the
# first successful check decides the type.
#
# Returns 'SD', 'FP' or 'Text'. An empty string is returned, after issuing a
# warning using carp, when none of the checks succeed.
#
sub GetFingerprintsFileType {
  my($FileName) = @_;
  my($TypeCheck, $FileType, $FileExtensions, @TypeChecks);

  @TypeChecks = (
    ['SD', "sdf sd"],
    ['FP', "fpf fp"],
    ['Text', "csv tsv"],
  );

  for $TypeCheck (@TypeChecks) {
    ($FileType, $FileExtensions) = @{$TypeCheck};
    if (FileUtil::CheckFileType($FileName, $FileExtensions)) {
      return $FileType;
    }
  }

  carp "Warning: Fingerprints::FingerprintsFileUtil::GetFingerprintsFileType: Can't determine fingerprints file type for $FileName: It's not a fingerprints file...\n";

  return '';
}


# Process fingerprints bit-vector and vector string data in a file using FingerprintsFileIO
# object and return references to arrays of CompoundIDs and FingerprintsObjects...
#
# Parameters:
#   . FingerprintsFileIO - object providing GetStatus, Open, Read, IsFingerprintsDataValid,
#     GetCompoundID, GetFingerprints and Close methods. A false value causes
#     (undef, undef) to be returned without any processing.
#   . CheckCompoundIDs - optional flag; default: 0. When true, only the first entry for
#     each compound ID is kept and subsequent entries are ignored with a warning.
#
# Returns:
#   . (\@CompoundIDs, \@FingerprintsObjects) containing the accepted entries in the order
#     in which these were read.
#
# Note:
#   . The file is opened when GetStatus returns a false value and is always closed after
#     processing.
#   . Progress and summary messages are printed to the currently selected output handle.
#
sub ReadAndProcessFingerpritsData {
  my($FingerprintsFileIO, $CheckCompoundIDs) = @_;
  my($CompoundID, $FingerprintsCount, $IgnoredFingerprintsCount, @CompoundIDs, @FingerprintsObjects, %UniqueCompoundIDs);

  if (!$FingerprintsFileIO) {
    return (undef, undef);
  }
  $CheckCompoundIDs = defined $CheckCompoundIDs ? $CheckCompoundIDs : 0;

  print "\nReading and processing fingerprints data...\n";

  # Exactly one zero for each counter...
  ($FingerprintsCount, $IgnoredFingerprintsCount) = (0) x 2;

  @CompoundIDs = ();
  @FingerprintsObjects = ();

  %UniqueCompoundIDs = ();

  # Check and open file for reading...
  if (!$FingerprintsFileIO->GetStatus()) {
    $FingerprintsFileIO->Open();
  }

  FINGERPRINTS: while ($FingerprintsFileIO->Read()) {
    $FingerprintsCount++;

    if (!$FingerprintsFileIO->IsFingerprintsDataValid()) {
      $IgnoredFingerprintsCount++;
      next FINGERPRINTS;
    }

    if ($CheckCompoundIDs) {
      $CompoundID = $FingerprintsFileIO->GetCompoundID();
      if (exists $UniqueCompoundIDs{$CompoundID}) {
        warn "Warning: Ignoring fingerprints data for compound ID $CompoundID: Multiple entries for compound ID in fingerprints file.\n";
        $IgnoredFingerprintsCount++;
        next FINGERPRINTS;
      }
      $UniqueCompoundIDs{$CompoundID} = $CompoundID;
    }

    push @FingerprintsObjects, $FingerprintsFileIO->GetFingerprints();

    # The compound ID retrieved for the uniqueness check is reused instead of
    # asking the IO object for it again...
    if (!$CheckCompoundIDs) {
      $CompoundID = $FingerprintsFileIO->GetCompoundID();
    }
    push @CompoundIDs, $CompoundID;
  }
  $FingerprintsFileIO->Close();

  print "Number of fingerprints data entries: $FingerprintsCount\n";
  print "Number of fingerprints date entries processed successfully: ", ($FingerprintsCount - $IgnoredFingerprintsCount) , "\n";
  print "Number of fingerprints data entries ignored due to missing/invalid data: $IgnoredFingerprintsCount\n\n";

  return (\@CompoundIDs, \@FingerprintsObjects);
}
