MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # File: ExtractFromSDFiles.pl
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
  16 # any warranty; without even the implied warranty of merchantability of fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use FindBin; use lib "$FindBin::Bin/../lib";
  28 use Getopt::Long;
  29 use File::Basename;
  30 use Text::ParseWords;
  31 use Benchmark;
  32 use SDFileUtil;
  33 use FileUtil;
  34 use TextUtil;
  35 
  36 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  37 
  38 # Autoflush STDOUT
  39 $| = 1;
  40 
  41 # Starting message...
  42 $ScriptName = basename($0);
  43 print "\n$ScriptName:Starting...\n\n";
  44 $StartTime = new Benchmark;
  45 
  46 # Get the options and setup script...
  47 SetupScriptUsage();
  48 if ($Options{help} || @ARGV < 1) {
  49   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  50 }
  51 
  52 my(@SDFilesList);
  53 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  54 
  55 # Process options...
  56 print "Processing options...\n";
  57 my(%OptionsInfo);
  58 ProcessOptions();
  59 
  60 # Collect information about SD files...
  61 print "Checking input SD file(s)...\n";
  62 my(%SDFilesInfo);
  63 RetrieveSDFilesInfo();
  64 
  65 # Generate output files...
  66 my($FileIndex);
  67 if (@SDFilesList > 1) {
  68   print "\nProcessing SD files...\n";
  69 }
  70 for $FileIndex (0 .. $#SDFilesList) {
  71   if ($SDFilesInfo{FileOkay}[$FileIndex]) {
  72     print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  73     ExtractFromSDFile($FileIndex);
  74   }
  75 }
  76 print "\n$ScriptName:Done...\n\n";
  77 
  78 $EndTime = new Benchmark;
  79 $TotalTime = timediff ($EndTime, $StartTime);
  80 print "Total time: ", timestr($TotalTime), "\n";
  81 
  82 ###############################################################################
  83 
  84 # Extract data from a SD file...
  85 sub ExtractFromSDFile {
  86   my($FileIndex) = @_;
  87 
  88   OpenInputAndOutputFiles($FileIndex);
  89 
  90   MODE: {
  91     if ($OptionsInfo{Mode} =~ /^AllDataFields$/i) {
  92       ExtractAllDataFields($FileIndex);
  93       last MODE;
  94     }
  95     if ($OptionsInfo{Mode} =~ /^CommonDataFields$/i) {
  96       ExtractCommonDataFields($FileIndex);
  97       last MODE;
  98     }
  99     if ($OptionsInfo{Mode} =~ /^DataFields$/i) {
 100       ExtractDataFields($FileIndex);
 101       last MODE;
 102     }
 103     if ($OptionsInfo{Mode} =~ /^(DataFieldByList|DatafieldUniqueByList)$/i) {
 104       ExtractDataFieldByList($FileIndex);
 105       last MODE;
 106     }
 107     if ($OptionsInfo{Mode} =~ /^DataFieldNotByList$/i) {
 108       ExtractDataFieldNotByList($FileIndex);
 109       last MODE;
 110     }
 111     if ($OptionsInfo{Mode} =~ /^DataFieldsByValue$/i) {
 112       ExtractDataFieldsByValue($FileIndex);
 113       last MODE;
 114     }
 115     if ($OptionsInfo{Mode} =~ /^DataFieldsByRegex$/i) {
 116       ExtractDataFieldsByRegex($FileIndex);
 117       last MODE;
 118     }
 119     if ($OptionsInfo{Mode} =~ /^RandomCmpds$/i) {
 120       ExtractRandomCompounds($FileIndex);
 121       last MODE;
 122     }
 123     if ($OptionsInfo{Mode} =~ /^MolNames$/i) {
 124       ExtractMolNames($FileIndex);
 125       last MODE;
 126     }
 127     if ($OptionsInfo{Mode} =~ /^RecordNum$/i) {
 128       ExtractRecordNum($FileIndex);
 129       last MODE;
 130     }
 131     if ($OptionsInfo{Mode} =~ /^RecordNums$/i) {
 132       ExtractRecordNums($FileIndex);
 133       last MODE;
 134     }
 135     if ($OptionsInfo{Mode} =~ /^RecordRange$/i) {
 136       ExtractRecordRange($FileIndex);
 137       last MODE;
 138     }
 139     if ($OptionsInfo{Mode} =~ /^2DCmpdRecords$/i) {
 140       Extract2DCmpdRecords($FileIndex);
 141       last MODE;
 142     }
 143     if ($OptionsInfo{Mode} =~ /^3DCmpdRecords$/i) {
 144       Extract3DCmpdRecords($FileIndex);
 145       last MODE;
 146     }
 147     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldbylist, datafielduniquebylist, datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n";
 148   }
 149 
 150   CloseInputAndOutputFiles();
 151 }
 152 
 153 # Extract all data fields...
 154 sub ExtractAllDataFields {
 155   my($FileIndex) = @_;
 156   my(@CmpdLines);
 157 
 158   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 159   WriteTextFileColLabels();
 160 
 161   while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 162     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 163     %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 164 
 165     SetupDataValues();
 166     WriteTextFileCmpdData();
 167     WriteSDFileCmpdData();
 168   }
 169 }
 170 
 171 # Extract common data fields...
 172 sub ExtractCommonDataFields {
 173   my($FileIndex) = @_;
 174   my(@CmpdLines);
 175 
 176   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{CommonDataFieldLabels}[$FileIndex]};
 177   WriteTextFileColLabels();
 178 
 179   while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 180     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 181     %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 182 
 183     SetupDataValues();
 184     WriteTextFileCmpdData();
 185     WriteSDFileCmpdData();
 186   }
 187 }
 188 
 189 # Extract specified data fields...
 190 sub ExtractDataFields {
 191   my($FileIndex) = @_;
 192   my(@CmpdLines);
 193 
 194   @{$SDFilesInfo{DataLabels}} = @{$OptionsInfo{SpecifiedDataFieldLabels}};
 195   WriteTextFileColLabels();
 196 
 197   while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 198     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 199     %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 200 
 201     SetupDataValues();
 202     WriteTextFileCmpdData();
 203     WriteSDFileCmpdData();
 204   }
 205 }
 206 
 207 # Extract data fields using a list...
 208 sub ExtractDataFieldByList {
 209   my($FileIndex) = @_;
 210   my($CmpdNum, $Value, $SpecifiedDataFieldValuesFoundCount, $CurrentValue, $SpecifiedDataFieldLabel, @CmpdLines);
 211 
 212   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 213   WriteTextFileColLabels();
 214 
 215   for $Value (keys %{$OptionsInfo{SpecifiedDataFieldValues}}) {
 216     $OptionsInfo{SpecifiedDataFieldValues}{$Value} = "NotFound";
 217   }
 218   $SpecifiedDataFieldValuesFoundCount = 0;
 219   $SpecifiedDataFieldLabel = $OptionsInfo{SpecifiedDataFieldLabel};
 220 
 221   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 222     $CmpdNum++;
 223 
 224     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 225     %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 226 
 227     if (!exists $SDFilesInfo{DataFieldValues}{$SpecifiedDataFieldLabel}) {
 228       next CMPDSTRING;
 229     }
 230 
 231     SetupDataValues();
 232 
 233     $SpecifiedDataFieldLabel = $OptionsInfo{SpecifiedDataFieldLabel};
 234     $CurrentValue = $SDFilesInfo{DataFieldValues}{$SpecifiedDataFieldLabel};
 235 
 236     if (exists $OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue}) {
 237       if ($SpecifiedDataFieldValuesFoundCount < $OptionsInfo{SpecifiedDataFieldValuesCount}) {
 238         if ($OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue} eq "NotFound") {
 239           $SpecifiedDataFieldValuesFoundCount++;
 240           $OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue} = "Found";
 241           if ($OptionsInfo{Mode} =~ /^DataFieldUniqueByList$/i) {
 242             WriteSDFileCmpdString();
 243             WriteTextFileCmpdData();
 244           }
 245         }
 246         if ($OptionsInfo{Mode} =~ /^DataFieldByList$/i) {
 247           WriteSDFileCmpdString();
 248           WriteTextFileCmpdData();
 249         }
 250       }
 251       if ($SpecifiedDataFieldValuesFoundCount >= $OptionsInfo{SpecifiedDataFieldValuesCount}) {
 252         last CMPDSTRING;
 253       }
 254     }
 255   }
 256 }
 257 
 258 # Extract data field whose values are not on the specified list...
 259 sub ExtractDataFieldNotByList {
 260   my($FileIndex) = @_;
 261   my($CurrentValue, $SpecifiedDataFieldLabel, @CmpdLines);
 262 
 263   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 264   WriteTextFileColLabels();
 265 
 266   $SpecifiedDataFieldLabel = $OptionsInfo{SpecifiedDataFieldLabel};
 267 
 268   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 269     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 270     %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 271 
 272     if (!exists $SDFilesInfo{DataFieldValues}{$SpecifiedDataFieldLabel}) {
 273       next CMPDSTRING;
 274     }
 275 
 276     SetupDataValues();
 277 
 278     $CurrentValue = $SDFilesInfo{DataFieldValues}{$SpecifiedDataFieldLabel};
 279 
 280     # Make sure the current value is not empty and is not only specified list of values...
 281     if (IsEmpty($CurrentValue) || exists $OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue}) {
 282       next CMPDSTRING;
 283     }
 284 
 285     WriteSDFileCmpdString();
 286     WriteTextFileCmpdData();
 287   }
 288 }
 289 
 290 # Extract data fields by value...
 291 sub ExtractDataFieldsByValue {
 292   my($FileIndex) = @_;
 293   my($Label, $CurrentValue, $SpecifiedCriterion, $SpecifiedValue, $ViolationCount, $Nothing, @CmpdLines);
 294 
 295   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 296   WriteTextFileColLabels();
 297 
 298   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 299     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 300     %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 301 
 302     SetupDataValues();
 303     $ViolationCount = 0;
 304 
 305     for $Label (@{$OptionsInfo{SpecifiedDataFieldLabels}}) {
 306       if (exists $SDFilesInfo{DataFieldValues}{$Label}) {
 307         $CurrentValue = $SDFilesInfo{DataFieldValues}{$Label};
 308         $SpecifiedCriterion = $OptionsInfo{SpecifiedDataFieldCriteriaMap}{$Label};
 309         $SpecifiedValue = $OptionsInfo{SpecifiedDataFieldValuesMap}{$Label};
 310 
 311         if ($OptionsInfo{NumericalComparison}) {
 312           CRITERION: {
 313               if ($SpecifiedCriterion =~ /^eq$/i) { if ($CurrentValue != $SpecifiedValue) { $ViolationCount++; last CRITERION; } }
 314               if ($SpecifiedCriterion =~ /^le$/i) { if ($CurrentValue > $SpecifiedValue) { $ViolationCount++; last CRITERION; } }
 315               if ($SpecifiedCriterion =~ /^ge$/i) { if ($CurrentValue < $SpecifiedValue) { $ViolationCount++; last CRITERION; } }
 316               $Nothing = 1;
 317             }
 318         }
 319         else {
 320           CRITERION: {
 321               if ($SpecifiedCriterion =~ /^eq$/i) { if ($CurrentValue ne $SpecifiedValue) { $ViolationCount++; last CRITERION; } }
 322               if ($SpecifiedCriterion =~ /^le$/i) { if ($CurrentValue gt $SpecifiedValue) { $ViolationCount++; last CRITERION; } }
 323               if ($SpecifiedCriterion =~ /^ge$/i) { if ($CurrentValue lt $SpecifiedValue) { $ViolationCount++; last CRITERION; } }
 324               $Nothing = 1;
 325             }
 326         }
 327       }
 328     }
 329     if ($ViolationCount <= $OptionsInfo{Violations}) {
 330       WriteSDFileCmpdString();
 331       WriteTextFileCmpdData();
 332     }
 333   }
 334 }
 335 
 336 # Extract data fields by value using regular expression match...
 337 sub ExtractDataFieldsByRegex {
 338   my($FileIndex) = @_;
 339   my($Label, $CurrentValue, $SpecifiedRegexCriterion, $SpecifiedRegex, $ViolationCount, $Nothing, @CmpdLines);
 340 
 341   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 342   WriteTextFileColLabels();
 343 
 344   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 345     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 346     %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 347 
 348     SetupDataValues();
 349     $ViolationCount = 0;
 350 
 351     for $Label (@{$OptionsInfo{SpecifiedDataFieldLabels}}) {
 352       if (exists $SDFilesInfo{DataFieldValues}{$Label}) {
 353         $CurrentValue = $SDFilesInfo{DataFieldValues}{$Label};
 354            $SpecifiedRegexCriterion = $OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}{$Label};
 355            $SpecifiedRegex = $OptionsInfo{SpecifiedDataFieldRegexMap}{$Label};
 356 
 357         if ($OptionsInfo{RegexIgnoreCase}) {
 358           CRITERION: {
 359                  if ($SpecifiedRegexCriterion =~ /^eq$/i) { if ($CurrentValue !~ /$SpecifiedRegex/i) { $ViolationCount++; last CRITERION; } }
 360                  if ($SpecifiedRegexCriterion =~ /^ne$/i) { if ($CurrentValue =~ /$SpecifiedRegex/i) {  $ViolationCount++; last CRITERION; } }
 361               $Nothing = 1;
 362             }
 363         }
 364         else {
 365           CRITERION: {
 366                  if ($SpecifiedRegexCriterion =~ /^eq$/i) { if ($CurrentValue !~ /$SpecifiedRegex/) { $ViolationCount++; last CRITERION; } }
 367                  if ($SpecifiedRegexCriterion =~ /^ne$/i) { if ($CurrentValue =~ /$SpecifiedRegex/) {  $ViolationCount++; last CRITERION; } }
 368               $Nothing = 1;
 369             }
 370         }
 371       }
 372     }
 373     if ($ViolationCount <= $OptionsInfo{Violations}) {
 374       WriteSDFileCmpdString();
 375       WriteTextFileCmpdData();
 376     }
 377   }
 378 }
 379 
 380 # Extract random compounds...
 381 sub ExtractRandomCompounds {
 382   my($FileIndex) = @_;
 383   my($CmpdNum, $CmpdCount, $RandomCycleCount, $RandomIndex, @CmpdLines, %RandomCmpdIndexMap);
 384 
 385   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 386   WriteTextFileColLabels();
 387 
 388   $CmpdCount = $SDFilesInfo{CmpdCount}[$FileIndex];
 389   srand($OptionsInfo{Seed});
 390   $RandomCycleCount = 0;
 391 
 392   %RandomCmpdIndexMap = ();
 393   while ($RandomCycleCount <= $CmpdCount && $RandomCycleCount <= $OptionsInfo{NumOfCmpds}) {
 394     $RandomCycleCount++;
 395     $RandomIndex = int (rand $CmpdCount) + 1;
 396     $RandomCmpdIndexMap{$RandomIndex} = $RandomIndex;
 397   }
 398 
 399   $CmpdNum = 0;
 400   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 401     $CmpdNum++;
 402     if (!exists $RandomCmpdIndexMap{$CmpdNum}) {
 403       next CMPDSTRING;
 404     }
 405 
 406     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 407 
 408     WriteSDFileCmpdString();
 409 
 410     if ($OptionsInfo{OutputTextFile}) {
 411       %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 412       SetupDataValues();
 413       WriteTextFileCmpdData();
 414     }
 415   }
 416 }
 417 
 418 # Extract mol names...
 419 sub ExtractMolNames {
 420   my($FileIndex) = @_;
 421   my($MolName, $NewTextFileRef, @CmpdLines);
 422 
 423   push @{$SDFilesInfo{DataLabels}}, "MolName";
 424   WriteTextFileColLabels();
 425 
 426   $NewTextFileRef = $SDFilesInfo{NewTextFileRef};
 427   while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 428     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 429     $MolName = QuoteAWord(ParseCmpdMolNameLine($CmpdLines[0]), $OptionsInfo{OutQuote});
 430     print $NewTextFileRef "$MolName\n";
 431   }
 432 }
 433 
 434 # Extract a specific compound record...
 435 sub ExtractRecordNum {
 436   my($FileIndex) = @_;
 437   my($CmpdNum, @CmpdLines);
 438 
 439   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 440   WriteTextFileColLabels();
 441 
 442   $CmpdNum = 0;
 443 
 444   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 445     $CmpdNum++;
 446     if ($CmpdNum != $OptionsInfo{RecordNum}) {
 447       next CMPDSTRING;
 448     }
 449 
 450     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 451     WriteSDFileCmpdString();
 452 
 453     if ($OptionsInfo{OutputTextFile}) {
 454       %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 455       SetupDataValues();
 456       WriteTextFileCmpdData();
 457     }
 458     last CMPDSTRING;
 459   }
 460 }
 461 
 462 # Extract a specific compound records...
 463 sub ExtractRecordNums {
 464   my($FileIndex) = @_;
 465   my($CmpdNum, $CmpdCount, @CmpdLines);
 466 
 467   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 468   WriteTextFileColLabels();
 469 
 470   $CmpdNum = 0;
 471   $CmpdCount = 0;
 472 
 473   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 474     $CmpdNum++;
 475 
 476     if (exists $OptionsInfo{RecordNums}{$CmpdNum}) {
 477       $CmpdCount++;
 478       @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 479 
 480       WriteSDFileCmpdString();
 481 
 482       if ($OptionsInfo{OutputTextFile}) {
 483         %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 484         SetupDataValues();
 485         WriteTextFileCmpdData();
 486       }
 487     }
 488     elsif ($CmpdNum > $OptionsInfo{RecordNumsMax} || $CmpdCount >= $OptionsInfo{RecordNumsCount}) {
 489       last CMPDSTRING;
 490     }
 491   }
 492 }
 493 
 494 
 495 # Extract compounds in a specific record range...
 496 sub ExtractRecordRange {
 497   my($FileIndex) = @_;
 498   my($CmpdNum, @CmpdLines);
 499 
 500   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 501   WriteTextFileColLabels();
 502 
 503   $CmpdNum = 0;
 504   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 505     $CmpdNum++;
 506 
 507     if ($CmpdNum >= $OptionsInfo{StartRecordNum} && $CmpdNum <= $OptionsInfo{EndRecordNum}) {
 508       @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 509 
 510       WriteSDFileCmpdString();
 511 
 512       if ($OptionsInfo{OutputTextFile}) {
 513         %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 514         SetupDataValues();
 515         WriteTextFileCmpdData();
 516       }
 517     }
 518     elsif ($CmpdNum > $OptionsInfo{EndRecordNum}) {
 519       last CMPDSTRING;
 520     }
 521   }
 522 }
 523 
 524 # Extract 2D compound records...
 525 sub Extract2DCmpdRecords {
 526   my($FileIndex) = @_;
 527   my(@CmpdLines);
 528 
 529   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 530   WriteTextFileColLabels();
 531 
 532 
 533   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 534     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 535     if (!IsCmpd2D(\@CmpdLines)) {
 536       next CMPDSTRING;
 537     }
 538 
 539     WriteSDFileCmpdString();
 540 
 541     if ($OptionsInfo{OutputTextFile}) {
 542       %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 543       SetupDataValues();
 544       WriteTextFileCmpdData();
 545     }
 546   }
 547 }
 548 
 549 # Extract 3D compound records...
 550 sub Extract3DCmpdRecords {
 551   my($FileIndex) = @_;
 552   my(@CmpdLines);
 553 
 554   @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
 555   WriteTextFileColLabels();
 556 
 557 
 558   CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
 559     @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
 560     if (!IsCmpd3D(\@CmpdLines)) {
 561       next CMPDSTRING;
 562     }
 563 
 564     WriteSDFileCmpdString();
 565 
 566     if ($OptionsInfo{OutputTextFile}) {
 567       %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 568       SetupDataValues();
 569       WriteTextFileCmpdData();
 570     }
 571   }
 572 }
 573 
 574 
 575 # Open input and output files...
 576 sub OpenInputAndOutputFiles {
 577   my($FileIndex) = @_;
 578 
 579   $SDFilesInfo{NewTextFileRef} = undef;
 580   $SDFilesInfo{NewSDFileRef} = undef;
 581 
 582   if ($OptionsInfo{OutputTextFile} && $OptionsInfo{OutputSDFile}) {
 583     print "Generating files $SDFilesInfo{NewSDFileName}[$FileIndex] and $SDFilesInfo{NewTextFileName}[$FileIndex]...\n";
 584   }
 585   elsif ($OptionsInfo{OutputSDFile}) {
 586     print "Generating file $SDFilesInfo{NewSDFileName}[$FileIndex]...\n";
 587   }
 588   else {
 589     print "Generating file $SDFilesInfo{NewTextFileName}[$FileIndex]...\n";
 590   }
 591 
 592   if ($OptionsInfo{OutputSDFile}) {
 593     open NEWSDFILE, ">$SDFilesInfo{NewSDFileName}[$FileIndex]" or die "Error: Couldn't open $SDFilesInfo{NewSDFileName}[$FileIndex]: $! \n";
 594     $SDFilesInfo{NewSDFileRef} = \*NEWSDFILE;
 595   }
 596   if ($OptionsInfo{OutputTextFile}) {
 597     open NEWTEXTFILE, ">$SDFilesInfo{NewTextFileName}[$FileIndex]" or die "Error: Couldn't open $SDFilesInfo{NewTextFileName}[$FileIndex]: $! \n";
 598     $SDFilesInfo{NewTextFileRef} = \*NEWTEXTFILE;
 599   }
 600 
 601   open SDFILE, "$SDFilesList[$FileIndex]" or die "Error: Couldn't open $SDFilesList[$FileIndex]: $! \n";
 602   $SDFilesInfo{InputSDFileRef} = \*SDFILE;
 603 
 604 }
 605 
 606 # Close open input and output files...
 607 sub CloseInputAndOutputFiles {
 608   if ($SDFilesInfo{NewSDFileRef}) {
 609     close $SDFilesInfo{NewSDFileRef};
 610   }
 611   if ($SDFilesInfo{NewTextFileRef}) {
 612     close $SDFilesInfo{NewTextFileRef};
 613   }
 614 
 615   if ($SDFilesInfo{InputSDFileRef}) {
 616     close $SDFilesInfo{InputSDFileRef};
 617   }
 618 
 619   $SDFilesInfo{NewTextFileRef} = undef;
 620   $SDFilesInfo{NewSDFileRef} = undef;
 621   $SDFilesInfo{InputSDFileRef} = undef;
 622 }
 623 
 624 # Write out column labels for text file...
 625 sub WriteTextFileColLabels {
 626   my($ColLabelsLine, $NewTextFileRef);
 627 
 628   if (!$OptionsInfo{OutputTextFile}) {
 629     return;
 630   }
 631   $NewTextFileRef = $SDFilesInfo{NewTextFileRef};
 632 
 633   if ($OptionsInfo{OutputStrDataString}) {
 634     # Append structure data string label...
 635     my(@DataLabels);
 636 
 637     @DataLabels = ();
 638     push @DataLabels, @{$SDFilesInfo{DataLabels}};
 639     push @DataLabels, "StructureDataString";
 640 
 641     $ColLabelsLine = JoinWords(\@DataLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 642   }
 643   else {
 644     $ColLabelsLine = JoinWords(\@{$SDFilesInfo{DataLabels}}, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 645   }
 646   print $NewTextFileRef "$ColLabelsLine\n";
 647 }
 648 
 649 # Setup values for data fields...
 650 sub SetupDataValues {
 651   @{$SDFilesInfo{DataValues}} = map { exists $SDFilesInfo{DataFieldValues}{$_} ? $SDFilesInfo{DataFieldValues}{$_} : "" } @{$SDFilesInfo{DataLabels}};
 652 }
 653 
 654 # Write out structure data and specific data fields to SD file...
 655 sub WriteSDFileCmpdData {
 656   my($MolString, $Count, $NewSDFileRef);
 657 
 658   if (!$OptionsInfo{OutputSDFile}) {
 659     return;
 660   }
 661 
 662   $NewSDFileRef = $SDFilesInfo{NewSDFileRef};
 663 
 664   ($MolString) = split "M  END", $SDFilesInfo{CmpdString};
 665   $MolString .= "M  END";
 666   print $NewSDFileRef "$MolString\n";
 667 
 668   for $Count (0 .. $#{$SDFilesInfo{DataLabels}}) {
 669     print $NewSDFileRef ">  <$SDFilesInfo{DataLabels}[$Count]>\n$SDFilesInfo{DataValues}[$Count]\n\n";
 670   }
 671   print $NewSDFileRef "\$\$\$\$\n";
 672 }
 673 
 674 # Write out compound string...
 675 sub WriteSDFileCmpdString {
 676   my($NewSDFileRef);
 677 
 678   if (!$OptionsInfo{OutputSDFile}) {
 679     return;
 680   }
 681 
 682   $NewSDFileRef = $SDFilesInfo{NewSDFileRef};
 683   print $NewSDFileRef "$SDFilesInfo{CmpdString}\n";
 684 }
 685 
 686 # Write out data for text file...
 687 sub WriteTextFileCmpdData {
 688   my($DataValuesLine, $NewTextFileRef);
 689 
 690   if (!$OptionsInfo{OutputTextFile}) {
 691     return;
 692   }
 693 
 694   $NewTextFileRef = $SDFilesInfo{NewTextFileRef};
 695   $DataValuesLine = JoinWords(\@{$SDFilesInfo{DataValues}}, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 696 
 697   # Handle multiple lines data values for data fields by joining 'em using semicolons...
 698   if ($DataValuesLine =~ /\n/) {
 699     $DataValuesLine =~ s/\n/;/g;
 700   }
 701 
 702   if ($OptionsInfo{OutputStrDataString}) {
 703     # Append structure data string...
 704     my($StrDataString, $OutQuoteValue, $OutDelim, $StrDataStringDelimiter);
 705 
 706     if ($OptionsInfo{StrDataStringWithFields}) {
 707       $StrDataString = $SDFilesInfo{CmpdString};
 708     }
 709     else {
 710       ($StrDataString) = split "M  END", $SDFilesInfo{CmpdString};
 711       $StrDataString .= "M  END";
 712     }
 713     $StrDataStringDelimiter = $OptionsInfo{StrDataStringDelimiter};
 714     $StrDataString =~ s/\n/$StrDataStringDelimiter/g;
 715 
 716     $OutDelim = $OptionsInfo{OutDelim};
 717     $OutQuoteValue = $OptionsInfo{OutQuote} ? "\"" : "";
 718 
 719     print $NewTextFileRef "$DataValuesLine${OutDelim}${OutQuoteValue}${StrDataString}${OutQuoteValue}\n";
 720   }
 721   else {
 722     print $NewTextFileRef "$DataValuesLine\n";
 723   }
 724 }
 725 
 726 # Retrieve information about input SD files...
 727 sub RetrieveSDFilesInfo {
 728   my($SDFile, $Index, $FileDir, $FileExt, $FileName, $NewFileName, $NewSDFileName, $NewTextFileName, $CmpdCount);
 729 
 730   %SDFilesInfo = ();
 731 
 732   @{$SDFilesInfo{FileOkay}} = ();
 733   @{$SDFilesInfo{CmpdCount}} = ();
 734   @{$SDFilesInfo{NewTextFileName}} = ();
 735   @{$SDFilesInfo{NewSDFileName}} = ();
 736 
 737   @{$SDFilesInfo{AllDataFieldLabels}} = ();
 738   @{$SDFilesInfo{CommonDataFieldLabels}} = ();
 739 
 740   FILELIST: for $Index (0 .. $#SDFilesList) {
 741     $SDFile = $SDFilesList[$Index];
 742 
 743     $SDFilesInfo{FileOkay}[$Index] = 0;
 744 
 745     $SDFilesInfo{CmpdCount}[$Index] = 0;
 746     $SDFilesInfo{NewTextFileName}[$Index] = "";
 747     $SDFilesInfo{NewSDFileName}[$Index] = "";
 748 
 749     @{$SDFilesInfo{AllDataFieldLabels}[$Index]} = ();
 750     @{$SDFilesInfo{CommonDataFieldLabels}[$Index]} = ();
 751 
 752     if (!(-e $SDFile)) {
 753       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 754       next FILELIST;
 755     }
 756 
 757     if (!CheckFileType($SDFile, "sd sdf")) {
 758       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 759       next FILELIST;
 760     }
 761 
 762     # Generate appropriate name for the new output file.
 763     $FileDir = ""; $FileName = ""; $FileExt = "";
 764     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 765     $NewFileName = $FileName;
 766     $NewFileName = $FileName  . $OptionsInfo{FileNameMode};
 767     if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
 768       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 769       if ($RootFileName && $RootFileExt) {
 770         $NewFileName = $RootFileName;
 771       }
 772       else {
 773         $NewFileName = $OptionsInfo{OutFileRoot};
 774       }
 775     }
 776     $NewSDFileName = $NewFileName . ".$OptionsInfo{SDFileExt}";
 777     $NewTextFileName = $NewFileName . ".$OptionsInfo{TextFileExt}";
 778 
 779     if ($OptionsInfo{OutputSDFile}) {
 780       if (lc($NewSDFileName) eq lc($SDFile)) {
 781         warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
 782         print "Specify a different name using \"-r --root\" option or use default name.\n";
 783         next FILELIST;
 784       }
 785     }
 786 
 787     if (!$OptionsInfo{Overwrite}) {
 788       if ($OptionsInfo{OutputSDFile}) {
 789         if (-e $NewSDFileName) {
 790           warn "Warning: Ignoring file $SDFile: New file, $NewSDFileName, already exists\n";
 791           next FILELIST;
 792         }
 793       }
 794       if ($OptionsInfo{OutputTextFile}) {
 795         if (-e $NewTextFileName) {
 796           warn "Warning: Ignoring file $SDFile: New file, $NewTextFileName, already exists\n";
 797           next FILELIST;
 798         }
 799       }
 800     }
 801 
 802     if (!open SDFILE, "$SDFile") {
 803       warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
 804       next FILELIST;
 805     }
 806 
 807     my($CountCmpds, $CollectDataFields);
 808     my($CmpdString, @CmpdLines, @DataFieldLabels, %DataFieldLabelsMap,@CommonDataFieldLabels);
 809 
 810     $CountCmpds = ($OptionsInfo{Mode} =~ /^randomcmpds$/i) ? 1 : 0;
 811 
 812     $CollectDataFields = (($OptionsInfo{Mode} =~ /^(alldatafields|commondatafields|randomcmpds)$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^(datafieldsbyvalue|datafieldsbyregex)$/i  && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^datafieldbylist$/i  && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^datafielduniquebylist$/i  && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^datafieldnotbylist$/i  && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^recordnum$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^recordnums$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^recordrange$/i && $OptionsInfo{OutputTextFile})) ? 1 : 0;
 813 
 814     $CmpdCount = 0;
 815     if ($CountCmpds || $CollectDataFields) {
 816       @DataFieldLabels = ();
 817       @CommonDataFieldLabels = ();
 818       %DataFieldLabelsMap = ();
 819       CMPDSTRING: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 820         $CmpdCount++;
 821         if ($OptionsInfo{Mode} =~ /^recordnum$/i) {
 822           if ($CmpdCount == $OptionsInfo{RecordNum}) {
 823             @CmpdLines = split "\n", $CmpdString;
 824             @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
 825             last CMPDSTRING;
 826           }
 827         }
 828         if ($CollectDataFields) {
 829           my($Label);
 830           @CmpdLines = split "\n", $CmpdString;
 831           # Process compound data header labels and figure out which ones are present for
 832           # all the compounds...
 833           if (@DataFieldLabels) {
 834             my (@CmpdDataFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines);
 835             my(%CmpdDataFieldLabelsMap) = ();
 836             # Setup a map for the current labels...
 837             for $Label (@CmpdDataFieldLabels) {
 838               $CmpdDataFieldLabelsMap{$Label} = "PresentInSome";
 839             }
 840             # Check the presence old labels for this compound; otherwise, mark 'em new...
 841             for $Label (@DataFieldLabels) {
 842               if (!$CmpdDataFieldLabelsMap{$Label}) {
 843                 $DataFieldLabelsMap{$Label} = "PresentInSome";
 844               }
 845             }
 846             # Check the presence this compound in the old labels; otherwise, add 'em...
 847             for $Label (@CmpdDataFieldLabels ) {
 848               if (!$DataFieldLabelsMap{$Label}) {
 849                 # It's a new label...
 850                 push @DataFieldLabels, $Label;
 851                 $DataFieldLabelsMap{$Label} = "PresentInSome";
 852               }
 853             }
 854           }
 855           else {
 856             # Get the initial label set and set up a map...
 857             @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
 858             for $Label (@DataFieldLabels) {
 859               $DataFieldLabelsMap{$Label} = "PresentInAll";
 860             }
 861           }
 862           # Identify the common data field labels...
 863           if ($Options{mode} =~ /^commondatafields$/i) {
 864             @CommonDataFieldLabels = ();
 865             for $Label (@DataFieldLabels) {
 866               if ($DataFieldLabelsMap{$Label} eq "PresentInAll") {
 867                 push @CommonDataFieldLabels, $Label;
 868               }
 869             }
 870           }
 871         }
 872       }
 873     }
 874 
 875     $SDFilesInfo{FileOkay}[$Index] = 1;
 876 
 877     $SDFilesInfo{NewTextFileName}[$Index] = $NewTextFileName;
 878     $SDFilesInfo{NewSDFileName}[$Index] = $NewSDFileName;
 879 
 880     $SDFilesInfo{CmpdCount}[$Index] = $CmpdCount;
 881 
 882     push @{$SDFilesInfo{AllDataFieldLabels}[$Index]}, @DataFieldLabels;
 883     push @{$SDFilesInfo{CommonDataFieldLabels}[$Index]}, @CommonDataFieldLabels;
 884 
 885     close SDFILE;
 886   }
 887 }
 888 
 889 # Process options...
 890 sub ProcessOptions {
 891   %OptionsInfo = ();
 892 
 893   $OptionsInfo{Mode} = $Options{mode};
 894 
 895   $OptionsInfo{InDelim} = "\,";
 896   if ($Options{indelim} =~ /^semicolon$/i) {
 897     $OptionsInfo{InDelim} = "\;";
 898   }
 899   elsif ($Options{indelim} =~ /^tab$/i) {
 900     $OptionsInfo{InDelim} = "\t";
 901   }
 902 
 903   $OptionsInfo{OutDelim} = "\,";
 904   if ($Options{outdelim} =~ /^semicolon$/i) {
 905     $OptionsInfo{OutDelim} = "\;";
 906   }
 907   elsif ($Options{outdelim} =~ /^tab$/i) {
 908     $OptionsInfo{OutDelim} = "\t";
 909   }
 910 
 911   $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;
 912 
 913   $OptionsInfo{RegexIgnoreCase} = ($Options{regexignorecase} =~ /^yes$/i) ? 1 : 0;
 914 
 915   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : undef;
 916   $OptionsInfo{Overwrite} = $Options{overwrite} ? $Options{overwrite} : undef;
 917 
 918   $OptionsInfo{NumOfCmpds} = $Options{numofcmpds};
 919 
 920   $OptionsInfo{ValueComparisonMode} = $Options{valuecomparisonmode};
 921   $OptionsInfo{NumericalComparison} = ($Options{valuecomparisonmode} =~ /^Numeric$/i) ? 1 : 0;
 922 
 923   $OptionsInfo{Violations} = $Options{violations};
 924   $OptionsInfo{Seed} = $Options{seed};
 925 
 926 
 927   if ($Options{mode} =~ /^(datafields|datafieldsbyregex|datafieldsbyvalue|datafieldbylist|datafielduniquebylist|datafieldnotbylist)$/i) {
 928     if ($Options{datafields} || $Options{datafieldsfile}) {
 929       if ($Options{datafields} && $Options{datafieldsfile}) {
 930         die "Error: For \"-m --mode\" option values of datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, or datafieldnotbylist specify only one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n";
 931       }
 932     }
 933     else {
 934       die "Error: For \"-m --mode\" option values of datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, or datafieldnotbylist specify one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n";
 935     }
 936   }
 937   $OptionsInfo{DataFields} = $Options{datafields} ? $Options{datafields} : undef;
 938   $OptionsInfo{DataFieldsFile} = $Options{datafieldsfile} ? $Options{datafieldsfile} : undef;
 939 
 940   $OptionsInfo{RecordNum} = 0; $OptionsInfo{StartRecordNum} = 0; $OptionsInfo{EndRecordNum} = 0;
 941 
 942   %{$OptionsInfo{RecordNums}} = ();
 943   $OptionsInfo{RecordNumsMin} = 0; $OptionsInfo{RecordNumsMax} = 0; $OptionsInfo{RecordNumsCount} = 0;
 944 
 945   $OptionsInfo{Record} = $Options{record} ? $Options{record} : undef;
 946 
 947   if ($Options{mode} =~ /^(recordnum|recordnums|recordrange)$/i) {
 948     if ($Options{record}) {
 949       my($Record, @RecordSplit);
 950 
 951       $Record = $Options{record};
 952       $Record =~ s/ //g;
 953 
 954       @RecordSplit = split ",", $Record;
 955 
 956       if ($Options{mode} =~ /^recordnum$/i ) {
 957         if (@RecordSplit == 1) {
 958           $OptionsInfo{RecordNum} = $RecordSplit[0];
 959           if ($OptionsInfo{RecordNum} <= 0) {
 960             die "Error: The value specified, $OptionsInfo{RecordNum},  for option \"--records\" is not valid. Allowed values: > 0 \n";
 961           }
 962         }
 963         else {
 964           die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 1 value is allowed.\n";
 965         }
 966       }
 967       elsif ($Options{mode} =~ /^recordnums$/i ) {
 968         my($RecordNum, $RecordCount, @SortedRecordSplit);
 969 
 970         @SortedRecordSplit = sort { $a <=> $b } @RecordSplit;
 971 
 972         $RecordCount = 0;
 973         RECORDNUM: for $RecordNum (@SortedRecordSplit) {
 974           if (exists $OptionsInfo{RecordNums}{$RecordNum}) {
 975             next RECORDNUM;
 976           }
 977           $RecordCount++;
 978           $OptionsInfo{RecordNums}{$RecordNum} = $RecordNum;
 979         }
 980         $OptionsInfo{RecordNumsCount} = $RecordCount;
 981         $OptionsInfo{RecordNumsMin} = $SortedRecordSplit[0];
 982         $OptionsInfo{RecordNumsMax} = $SortedRecordSplit[$#SortedRecordSplit];
 983       }
 984       else {
 985         if (@RecordSplit == 2) {
 986           $OptionsInfo{StartRecordNum} = $RecordSplit[0];
 987           $OptionsInfo{EndRecordNum} = $RecordSplit[1];
 988           if ($OptionsInfo{StartRecordNum} <= 0 || $OptionsInfo{EndRecordNum} <= 0) {
 989             die "Error: The value pair specified, $Options{record},  for option \"--records\" is not valid. Allowed values: > 0 \n";
 990           }
 991         }
 992         else {
 993           die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 2 values is allowed.\n";
 994         }
 995         if ($OptionsInfo{StartRecordNum} > $OptionsInfo{EndRecordNum}) {
 996           die "Error: Start record number, $OptionsInfo{StartRecordNum}, must be smaller than end record number, $OptionsInfo{EndRecordNum}.\nSpecify different values using \"--record\" option.\n";
 997         }
 998       }
 999     }
1000     else {
1001       die "Error: For \"-m --mode\" option values recordnum, recordnums or recordrange, specify \"--record\" option value.\n";
1002     }
1003   }
1004 
1005   @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();
1006 
1007   my(@Words, $Line, $Value);
1008   if ($Options{mode} =~ /^datafields$/i) {
1009     @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();
1010     if ($Options{datafields}) {
1011       @{$OptionsInfo{SpecifiedDataFieldLabels}} = split $OptionsInfo{InDelim}, $Options{datafields};
1012     }
1013     elsif ($Options{datafieldsfile}) {
1014       open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n";
1015       while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
1016         @Words = quotewords($OptionsInfo{InDelim}, 0, $Line);
1017         if (@Words) {
1018           push @{$OptionsInfo{SpecifiedDataFieldLabels}}, @Words;
1019         }
1020       }
1021       close DATAFIELDSFILE;
1022     }
1023   }
1024   elsif ($Options{mode} =~ /^datafieldsbyvalue$/i) {
1025     my(@DataFieldsByValueTriplets);
1026     @DataFieldsByValueTriplets = ();
1027     if ($Options{datafields}) {
1028       @DataFieldsByValueTriplets = split $OptionsInfo{InDelim}, $Options{datafields};
1029     }
1030     elsif ($Options{datafieldsfile}) {
1031       open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n";
1032       while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
1033         @Words = quotewords($OptionsInfo{InDelim}, 0, $Line);
1034         if (@Words) {
1035           push @DataFieldsByValueTriplets, @Words;
1036         }
1037       }
1038       close DATAFIELDSFILE;
1039     }
1040     if ((@DataFieldsByValueTriplets % 3)) {
1041       if ($Options{datafields}) {
1042         die "Error: Triplets not found in values specified by \"-d --datafields\" option\n";
1043       }
1044       elsif ($Options{datafieldsfile}) {
1045         die "Error: Triplets not found in values specified by \"--datafieldsfile\" option\n";
1046       }
1047     }
1048     my($Index, $Label, $Value, $Criterion);
1049 
1050     @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();
1051     %{$OptionsInfo{SpecifiedDataFieldValuesMap}} = ();
1052     %{$OptionsInfo{SpecifiedDataFieldCriteriaMap}} = ();
1053 
1054     for ($Index = 0; $Index < @DataFieldsByValueTriplets; $Index = $Index + 3) {
1055       $Label = $DataFieldsByValueTriplets[$Index];
1056       $Value = $DataFieldsByValueTriplets[$Index + 1];
1057       $Criterion = $DataFieldsByValueTriplets[$Index + 2];
1058 
1059       if ($Criterion =~ /^(eq|le|ge)$/i) {
1060         push @{$OptionsInfo{SpecifiedDataFieldLabels}}, $Label;
1061         $OptionsInfo{SpecifiedDataFieldValuesMap}{$Label} = $Value;
1062         $OptionsInfo{SpecifiedDataFieldCriteriaMap}{$Label} = $Criterion;
1063       }
1064       else {
1065         warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion\n";
1066       }
1067     }
1068   }
1069   elsif ($Options{mode} =~ /^datafieldsbyregex$/i) {
1070     my(@DataFieldsByRegexTriplets);
1071 
1072     @DataFieldsByRegexTriplets = ();
1073     if ($Options{datafields}) {
1074       @DataFieldsByRegexTriplets = quotewords($OptionsInfo{InDelim}, 0, $Options{datafields});
1075     }
1076     elsif ($Options{datafieldsfile}) {
1077       open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n";
1078       while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
1079           @Words = quotewords($OptionsInfo{InDelim}, 0, $Line);
1080           if (@Words) {
1081             push @DataFieldsByRegexTriplets, @Words;
1082           }
1083       }
1084       close DATAFIELDSFILE;
1085     }
1086     if ((@DataFieldsByRegexTriplets % 3)) {
1087       if ($Options{datafields}) {
1088           die "Error: Triplet not found in values specified by \"-d --datafields\" option\n";
1089       }
1090       elsif ($Options{datafieldsfile}) {
1091           die "Error: Triplet not found in values specified by \"--datafieldsfile\" option\n";
1092       }
1093     }
1094 
1095     my($Index, $Label, $Value, $Criterion);
1096 
1097     @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();
1098     %{$OptionsInfo{SpecifiedDataFieldRegexMap}} = ();
1099     %{$OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}} = ();
1100 
1101     for ($Index = 0; $Index < @DataFieldsByRegexTriplets; $Index = $Index + 3) {
1102       $Label = $DataFieldsByRegexTriplets[$Index];
1103       $Value = $DataFieldsByRegexTriplets[$Index + 1];
1104       $Criterion = $DataFieldsByRegexTriplets[$Index + 2];
1105 
1106       if ($Criterion =~ /^(eq|ne)$/i) {
1107           push @{$OptionsInfo{SpecifiedDataFieldLabels}}, $Label;
1108           $OptionsInfo{SpecifiedDataFieldRegexMap}{$Label} = $Value;
1109           $OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}{$Label} = $Criterion;
1110       }
1111       else {
1112           warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion; Supported values: eq or ne\n";
1113       }
1114     }
1115   }
1116   elsif ($Options{mode} =~ /^(datafieldbylist|datafielduniquebylist|datafieldnotbylist)$/i) {
1117     my($Index, @DataFieldAndValuesList);
1118     if ($Options{datafields}) {
1119       @DataFieldAndValuesList = split $OptionsInfo{InDelim}, $Options{datafields};
1120     }
1121     elsif ($Options{datafieldsfile}) {
1122       open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n";
1123       while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
1124         @Words = quotewords($OptionsInfo{InDelim}, 0, $Line);
1125         if (@Words) {
1126           push @DataFieldAndValuesList, @Words;
1127         }
1128       }
1129       close DATAFIELDSFILE;
1130     }
1131     if (@DataFieldAndValuesList < 2) {
1132       if ($Options{datafields}) {
1133         die "Error: Invalid number of values specified by \"-d --datafields\" option\n";
1134       }
1135       elsif ($Options{datafieldsfile}) {
1136         die "Error: Invalid number values specified by \"--datafieldsfile\" option\n";
1137       }
1138     }
1139 
1140     $OptionsInfo{SpecifiedDataFieldLabel} = $DataFieldAndValuesList[0];
1141     $OptionsInfo{SpecifiedDataFieldValuesCount} = @DataFieldAndValuesList - 1;
1142     %{$OptionsInfo{SpecifiedDataFieldValues}} = ();
1143 
1144     for ($Index = 1; $Index < @DataFieldAndValuesList; $Index++) {
1145       $Value = $DataFieldAndValuesList[$Index];
1146       $OptionsInfo{SpecifiedDataFieldValues}{$Value} = "NotFound";
1147     }
1148   }
1149 
1150   $OptionsInfo{SDFileExt} = "sdf";
1151   $OptionsInfo{TextFileExt} = "csv";
1152 
1153   if ($Options{outdelim} =~ /^tab$/i) {
1154     $OptionsInfo{TextFileExt} = "tsv";
1155   }
1156 
1157   if ($Options{mode} =~ /^(alldatafields|molnames)$/i) {
1158     $OptionsInfo{OutputSDFile} = 0;
1159     $OptionsInfo{OutputTextFile} = 1;
1160   }
1161   else {
1162     $OptionsInfo{OutputSDFile} = ($Options{output} =~ /^(SD|both)$/i) ? 1 : 0;
1163     $OptionsInfo{OutputTextFile} = ($Options{output} =~ /^(text|both)$/i) ? 1 : 0;
1164   }
1165 
1166   $OptionsInfo{StrDataString} = $Options{strdatastring};
1167   $OptionsInfo{OutputStrDataString} = ($Options{strdatastring} =~ /^Yes$/i) ? 1 : 0;
1168 
1169   $OptionsInfo{StrDataStringDelimiter} = $Options{strdatastringdelimiter};
1170 
1171   if (IsEmpty($Options{strdatastringdelimiter})) {
1172     die "Error: No value specified for \"--StrDataStringDelimiter\" option.\n";
1173   }
1174   $OptionsInfo{StrDataStringMode} = $Options{strdatastringmode};
1175   $OptionsInfo{StrDataStringWithFields} = $Options{strdatastringmode} =~ /^StrAndDataFields$/i ? 1 : 0;
1176 
1177   MODE: {
1178     if ($Options{mode} =~ /^alldatafields$/i) { $OptionsInfo{FileNameMode} = "AllDataDields"; last MODE; }
1179     if ($Options{mode} =~ /^commondatafields$/i) { $OptionsInfo{FileNameMode} = "CommonDataDields"; last MODE; }
1180     if ($Options{mode} =~ /^datafields$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFields"; last MODE; }
1181     if ($Options{mode} =~ /^datafieldsbyvalue$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldsByValue"; last MODE; }
1182     if ($Options{mode} =~ /^datafieldsbyregex$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldsByRegex"; last MODE; }
1183     if ($Options{mode} =~ /^datafieldbylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataField"; last MODE; }
1184     if ($Options{mode} =~ /^datafielduniquebylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedUniqueDataField"; last MODE; }
1185     if ($Options{mode} =~ /^datafieldnotbylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldNotByList"; last MODE; }
1186     if ($Options{mode} =~ /^molnames$/i) { $OptionsInfo{FileNameMode} = "MolName"; last MODE; }
1187     if ($Options{mode} =~ /^randomcmpds$/i) { $OptionsInfo{FileNameMode} = "RandomCmpds"; last MODE; }
1188     if ($Options{mode} =~ /^recordnum$/i) { $OptionsInfo{FileNameMode} = "RecordNum$OptionsInfo{RecordNum}"; last MODE; }
1189     if ($Options{mode} =~ /^recordnums$/i) { $OptionsInfo{FileNameMode} = "RecordNums"; last MODE; }
1190     if ($Options{mode} =~ /^recordrange$/i) { $OptionsInfo{FileNameMode} = "RecordNum$OptionsInfo{StartRecordNum}" . "To" . "$OptionsInfo{EndRecordNum}"; last MODE; }
1191     if ($Options{mode} =~ /^2dcmpdrecords$/i) { $OptionsInfo{FileNameMode} = "2DCmpdRecords"; last MODE; }
1192     if ($Options{mode} =~ /^3dcmpdrecords$/i) { $OptionsInfo{FileNameMode} = "3DCmpdRecords"; last MODE; }
1193     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldbylist, datafielduniquebylist, , datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n";
1194   }
1195 
1196 }
1197 
# Setup script usage and retrieve command line arguments specified using various options...
#
# Initialize %Options with default values, let GetOptions overwrite them with
# the values specified on the command line, and validate the option values
# which are restricted to a range or a fixed set of names.
#
# Reads:   @ARGV (via GetOptions), $ScriptName
# Writes:  %Options; changes the current directory when "-w --workingdir" is
#          specified.
# Dies:    When GetOptions fails, the working directory is not valid, or an
#          option value is not one of the allowed values.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();
  $Options{numofcmpds} = 1;
  $Options{mode} = "alldatafields";
  $Options{indelim} = "comma";
  $Options{outdelim} = "comma";
  $Options{output} = "SD";
  $Options{quote} = "yes";
  $Options{regexignorecase} = "yes";
  $Options{valuecomparisonmode} = "numeric";
  $Options{violations} = 0;
  $Options{seed} = 123456789;

  $Options{strdatastring} = "no";
  $Options{strdatastringdelimiter} = "|";
  $Options{strdatastringmode} = "StrOnly";

  if (!GetOptions(\%Options, "help|h", "datafields|d=s", "datafieldsfile=s", "indelim=s", "mode|m=s", "numofcmpds|n=i", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "regexignorecase=s", "record=s", "root|r=s", "seed|s=i", "strdatastring=s", "strdatastringdelimiter=s", "strdatastringmode=s", "valuecomparisonmode=s", "violations|v=i", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }

  # Switch to the working directory before any further processing...
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate numerical ranges and names; all name comparisons ignore case...
  if ($Options{numofcmpds} < 1) {
    die "Error: The value specified, $Options{numofcmpds},  for option \"-n --numofcmpds\" is not valid. Allowed values: >= 1 \n";
  }
  if ($Options{valuecomparisonmode} !~ /^(Numeric|Alphanumeric)$/i) {
    die "Error: The value specified, $Options{valuecomparisonmode}, for option \"--ValueComparisonMode\" is not valid. Allowed values: Numeric or Alphanumeric\n";
  }
  if ($Options{violations} < 0) {
    die "Error: The value specified, $Options{violations},  for option \"-v --violations\" is not valid. Allowed values: >= 0 \n";
  }
  if ($Options{mode} !~ /^(alldatafields|commondatafields|datafields|datafieldsbyvalue|datafieldsbyregex|datafieldbylist|datafielduniquebylist|datafieldnotbylist|molnames|randomcmpds|recordnum|recordnums|recordrange|2dcmpdrecords|3dcmpdrecords)$/i) {
    # The list of allowed values matches the names accepted by the check above...
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n";
  }
  if ($Options{output} !~ /^(SD|text|both)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{regexignorecase} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{regexignorecase}, for option \"--regexignorecase\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{strdatastring} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{strdatastring}, for option \"--StrDataString\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{strdatastringmode} !~ /^(StrOnly|StrAndDataFields)$/i) {
    die "Error: The value specified, $Options{strdatastringmode}, for option \"--StrDataStringMode\" is not valid. Allowed values: StrOnly or StrAndDataFields\n";
  }
}
1261