MayaChemTools

   1 package Fingerprints::FingerprintsFileUtil;
   2 #
   3 # File: FingerprintsFileUtil.pm
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
   16 # any warranty; without even the implied warranty of merchantability or fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use Exporter;
  28 use Carp;
  29 use TextUtil ();
  30 use FileUtil ();
  31 use FileIO::FingerprintsSDFileIO;
  32 use FileIO::FingerprintsTextFileIO;
  33 use FileIO::FingerprintsFPFileIO;
  34 
  35 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  36 
  37 @ISA = qw(Exporter);
  38 @EXPORT = qw();
  39 @EXPORT_OK = qw(GetFingerprintsFileType ReadAndProcessFingerpritsData  NewFingerprintsFileIO);
  40 
  41 %EXPORT_TAGS = (all  => [@EXPORT, @EXPORT_OK]);
  42 
  43 # Generate new FingerprintsFileIO object for a SD, FP or Text fingerprints file specified using file name
  44 # along other appropriate parameters...
  45 #
  46 sub NewFingerprintsFileIO {
  47   my(%FingerprintsFileIOParams) = @_;
  48   my($FingerprintsFileIO, $FileType);
  49 
  50   if (!(exists($FingerprintsFileIOParams{Name}) && TextUtil::IsNotEmpty($FingerprintsFileIOParams{Name}))) {
  51     carp "Warning: Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO: Can't create new FingerprintsFileIO object: File name is not specified...\n";
  52     return undef;
  53   }
  54 
  55   if (!(exists($FingerprintsFileIOParams{Mode}) && TextUtil::IsNotEmpty($FingerprintsFileIOParams{Mode}))) {
  56     carp "Warning: Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO: Can't create new FingerprintsFileIO object: File mode is not specified...\n";
  57     return undef;
  58   }
  59 
  60   $FileType = GetFingerprintsFileType($FingerprintsFileIOParams{Name});
  61   if (TextUtil::IsEmpty($FileType)) {
  62     carp "Warning: Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO: Can't create new FingerprintsFileIO object: File type is not specified...\n";
  63     return undef;
  64   }
  65 
  66   # Generate fingerprints IO object...
  67   FILETYPE: {
  68     if ($FileType =~ /^SD$/i) {
  69       $FingerprintsFileIO = new FileIO::FingerprintsSDFileIO(%FingerprintsFileIOParams);
  70       last FILETYPE;
  71     }
  72     if ($FileType =~ /^FP$/i) {
  73       $FingerprintsFileIO = new FileIO::FingerprintsFPFileIO(%FingerprintsFileIOParams);
  74       last FILETYPE;
  75     }
  76     if ($FileType =~ /^Text$/i) {
  77       $FingerprintsFileIO = new FileIO::FingerprintsTextFileIO(%FingerprintsFileIOParams);
  78       last FILETYPE;
  79     }
  80     $FingerprintsFileIO = undef;
  81     carp "Warning: Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO: Fingerprints file type, $FileType, is not valid. Supported file types: SD, FP or Text\n";
  82   }
  83 
  84   return $FingerprintsFileIO;
  85 }
  86 
  87 # Get fingerpritns file type from fingerprints file name...
  88 #
  89 sub GetFingerprintsFileType {
  90   my($FileName) = @_;
  91   my($FileType);
  92 
  93   $FileType = '';
  94   FILETYPE: {
  95     if (FileUtil::CheckFileType($FileName, "sdf sd")) {
  96       $FileType = 'SD';
  97       last FILETYPE;
  98     }
  99     if (FileUtil::CheckFileType($FileName, "fpf fp")) {
 100       $FileType = 'FP';
 101       last FILETYPE;
 102     }
 103     if (FileUtil::CheckFileType($FileName, "csv tsv")) {
 104       $FileType = 'Text';
 105       last FILETYPE;
 106     }
 107     $FileType = '';
 108     carp "Warning: Fingerprints::FingerprintsFileUtil::GetFingerprintsFileType: Can't determine fingerprints file type for $FileName: It's not a fingerprints file...\n";
 109   }
 110 
 111   return $FileType;
 112 }
 113 
 114 
 115 # Process fingerprints bit-vector and vector string data in a file using FingerprintsFileIO
 116 # object and return a references to arrays of CompoundIDs and FingerprintsObjects...
 117 #
 118 # Note:
 119 #  . The file open and close is automatically performed during processing.
 120 #
 121 sub ReadAndProcessFingerpritsData {
 122   my($FingerprintsFileIO, $CheckCompoundIDs) = @_;
 123   my($CompoundID, $FingerprintsCount, $IgnoredFingerprintsCount, @CompundIDs, @FingerprintsObjects, %UniqueCompoundIDs);
 124 
 125   if (!$FingerprintsFileIO) {
 126     return (undef, undef);
 127   }
 128   $CheckCompoundIDs = defined $CheckCompoundIDs ? $CheckCompoundIDs : 0;
 129 
 130   print "\nReading and processing fingerprints data...\n";
 131 
 132   ($FingerprintsCount, $IgnoredFingerprintsCount) = (0) x 3;
 133 
 134   @CompundIDs = ();
 135   @FingerprintsObjects = ();
 136 
 137   %UniqueCompoundIDs = ();
 138 
 139   # Check and open file for reading...
 140   if (!$FingerprintsFileIO->GetStatus()) {
 141     $FingerprintsFileIO->Open();
 142   }
 143 
 144   FINGERPRINTS: while ($FingerprintsFileIO->Read()) {
 145     $FingerprintsCount++;
 146 
 147     if (!$FingerprintsFileIO->IsFingerprintsDataValid()) {
 148       $IgnoredFingerprintsCount++;
 149       next FINGERPRINTS;
 150     }
 151 
 152     if ($CheckCompoundIDs) {
 153       $CompoundID = $FingerprintsFileIO->GetCompoundID();
 154       if (exists $UniqueCompoundIDs{$CompoundID}) {
 155         warn "Warning: Ignoring fingerprints data for compound ID $CompoundID: Multiple entries for compound ID in fingerprints file.\n";
 156         $IgnoredFingerprintsCount++;
 157         next FINGERPRINTS;
 158       }
 159       $UniqueCompoundIDs{$CompoundID} = $CompoundID;
 160     }
 161 
 162     push @FingerprintsObjects, $FingerprintsFileIO->GetFingerprints();
 163     push @CompundIDs, $FingerprintsFileIO->GetCompoundID();
 164   }
 165   $FingerprintsFileIO->Close();
 166 
 167   print "Number of fingerprints data entries: $FingerprintsCount\n";
 168   print "Number of fingerprints date entries processed successfully: ", ($FingerprintsCount - $IgnoredFingerprintsCount)  , "\n";
 169   print "Number of fingerprints data entries ignored due to missing/invalid data: $IgnoredFingerprintsCount\n\n";
 170 
 171   return (\@CompundIDs, \@FingerprintsObjects);
 172 }
 173 
 174