#!/usr/bin/perl -w
#
# File: ExtractFromSDFiles.pl
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2024 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use SDFileUtil;
use FileUtil;
use TextUtil;

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
# Direct method call instead of the indirect object form "new Benchmark"...
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
#
# %OptionsInfo is declared before the call because ProcessOptions() fills it
# in as a file-level variable...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Collect information about SD files...
#
# RetrieveSDFilesInfo() reads %OptionsInfo, so it has to run after
# ProcessOptions()...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Generate output files...
my($FileIndex);
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
# Only files flagged as okay by RetrieveSDFilesInfo() are processed...
for $FileIndex (0 .. $#SDFilesList) {
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    ExtractFromSDFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Extract data from a SD file...
#
# Looks up the extraction routine for the current mode, opens the input and
# output files for the file at $FileIndex, runs the routine and closes the
# files again.
#
# The mode is checked before any file is opened, so an invalid mode value
# doesn't leave behind freshly truncated output files.
#
sub ExtractFromSDFile {
  my($FileIndex) = @_;
  my($Mode, $ModeHandlerRef, %ModeHandlersMap);

  # Mode names are matched case insensitively; the keys are lower case...
  %ModeHandlersMap = (
    'alldatafields'         => \&ExtractAllDataFields,
    'commondatafields'      => \&ExtractCommonDataFields,
    'datafields'            => \&ExtractDataFields,
    'datafieldbylist'       => \&ExtractDataFieldByList,
    'datafielduniquebylist' => \&ExtractDataFieldByList,
    'datafieldnotbylist'    => \&ExtractDataFieldNotByList,
    'datafieldsbyvalue'     => \&ExtractDataFieldsByValue,
    'datafieldsbyregex'     => \&ExtractDataFieldsByRegex,
    'randomcmpds'           => \&ExtractRandomCompounds,
    'molnames'              => \&ExtractMolNames,
    'recordnum'             => \&ExtractRecordNum,
    'recordnums'            => \&ExtractRecordNums,
    'recordrange'           => \&ExtractRecordRange,
    '2dcmpdrecords'         => \&Extract2DCmpdRecords,
    '3dcmpdrecords'         => \&Extract3DCmpdRecords,
  );

  $Mode = $OptionsInfo{Mode};
  $ModeHandlerRef = $ModeHandlersMap{lc $Mode};
  if (!defined $ModeHandlerRef) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n";
  }

  OpenInputAndOutputFiles($FileIndex);
  $ModeHandlerRef->($FileIndex);
  CloseInputAndOutputFiles();
}

# Extract all data fields...
#
# Uses every data field label collected for the file at $FileIndex as the
# output label set, writes the column labels and then, for each compound,
# writes its values for those labels to the text and SD output files.
#
sub ExtractAllDataFields {
  my($FileIndex) = @_;
  my(@CmpdLines);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    SetupDataValues();
    WriteTextFileCmpdData();
    WriteSDFileCmpdData();
  }
}

# Extract common data fields...
#
# Same processing as ExtractAllDataFields(), except that the output label set
# is the list of common data field labels collected for the file at $FileIndex.
#
sub ExtractCommonDataFields {
  my($FileIndex) = @_;
  my(@CmpdLines);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{CommonDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    SetupDataValues();
    WriteTextFileCmpdData();
    WriteSDFileCmpdData();
  }
}

# Extract specified data fields...
#
# The output label set is the list of data field labels specified by the
# user; each compound is written out with its values for those labels.
#
sub ExtractDataFields {
  my($FileIndex) = @_;
  my(@Lines);

  @{$SDFilesInfo{DataLabels}} = @{$OptionsInfo{SpecifiedDataFieldLabels}};
  WriteTextFileColLabels();

  while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    @Lines = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    SetupDataValues();
    WriteTextFileCmpdData();
    WriteSDFileCmpdData();
  }
}

# Extract data fields using a list...
#
# Handles both DataFieldByList and DataFieldUniqueByList modes. Compounds
# whose value for the specified data field is on the specified list are
# written out: every match in DataFieldByList mode, only the first match for
# each listed value in DataFieldUniqueByList mode. Reading stops as soon as
# each listed value has been seen at least once.
#
sub ExtractDataFieldByList {
  my($FileIndex) = @_;
  my($FieldLabel, $FieldValue, $ListedValue, $FoundCount, $UniqueMode, $AllMatchesMode, @Lines);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  # Start each file with all the listed values marked as not yet seen...
  for $ListedValue (keys %{$OptionsInfo{SpecifiedDataFieldValues}}) {
    $OptionsInfo{SpecifiedDataFieldValues}{$ListedValue} = "NotFound";
  }
  $FoundCount = 0;

  $FieldLabel = $OptionsInfo{SpecifiedDataFieldLabel};
  $UniqueMode = ($OptionsInfo{Mode} =~ /^DataFieldUniqueByList$/i) ? 1 : 0;
  $AllMatchesMode = ($OptionsInfo{Mode} =~ /^DataFieldByList$/i) ? 1 : 0;

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    @Lines = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    # Compounds without the specified data field can't match...
    next CMPDSTRING if (!exists $SDFilesInfo{DataFieldValues}{$FieldLabel});

    SetupDataValues();

    $FieldValue = $SDFilesInfo{DataFieldValues}{$FieldLabel};
    next CMPDSTRING if (!exists $OptionsInfo{SpecifiedDataFieldValues}{$FieldValue});

    if ($FoundCount < $OptionsInfo{SpecifiedDataFieldValuesCount}) {
      if ($OptionsInfo{SpecifiedDataFieldValues}{$FieldValue} eq "NotFound") {
        # First compound carrying this listed value...
        $FoundCount++;
        $OptionsInfo{SpecifiedDataFieldValues}{$FieldValue} = "Found";
        if ($UniqueMode) {
          WriteSDFileCmpdString();
          WriteTextFileCmpdData();
        }
      }
      if ($AllMatchesMode) {
        WriteSDFileCmpdString();
        WriteTextFileCmpdData();
      }
    }

    # All the listed values have been seen; nothing more to look for...
    last CMPDSTRING if ($FoundCount >= $OptionsInfo{SpecifiedDataFieldValuesCount});
  }
}

# Extract data field whose values are not on the specified list...
#
# Writes out compounds which contain the specified data field with a
# non-empty value that isn't on the specified list of values.
#
sub ExtractDataFieldNotByList {
  my($FileIndex) = @_;
  my($FieldLabel, $FieldValue, @Lines);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  $FieldLabel = $OptionsInfo{SpecifiedDataFieldLabel};

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    @Lines = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    next CMPDSTRING if (!exists $SDFilesInfo{DataFieldValues}{$FieldLabel});

    SetupDataValues();

    # Skip empty values and values present on the specified list...
    $FieldValue = $SDFilesInfo{DataFieldValues}{$FieldLabel};
    next CMPDSTRING if (IsEmpty($FieldValue) || exists $OptionsInfo{SpecifiedDataFieldValues}{$FieldValue});

    WriteSDFileCmpdString();
    WriteTextFileCmpdData();
  }
}

# Extract data fields by value...
#
# For each compound, every specified data field present in the compound is
# compared against its specified value using its criterion (eq, le or ge),
# numerically or as text depending on the value comparison mode. Compounds
# with no more than the allowed number of violations are written out. Data
# fields missing from a compound are not counted as violations.
#
sub ExtractDataFieldsByValue {
  my($FileIndex) = @_;
  my($FieldLabel, $FieldValue, $Criterion, $TargetValue, $Violations, $Violated, @Lines);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    @Lines = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    SetupDataValues();
    $Violations = 0;

    LABEL: for $FieldLabel (@{$OptionsInfo{SpecifiedDataFieldLabels}}) {
      next LABEL if (!exists $SDFilesInfo{DataFieldValues}{$FieldLabel});

      $FieldValue = $SDFilesInfo{DataFieldValues}{$FieldLabel};
      $Criterion = $OptionsInfo{SpecifiedDataFieldCriteriaMap}{$FieldLabel};
      $TargetValue = $OptionsInfo{SpecifiedDataFieldValuesMap}{$FieldLabel};

      # A criterion other than eq, le or ge never counts as a violation...
      $Violated = 0;
      if ($OptionsInfo{NumericalComparison}) {
        if ($Criterion =~ /^eq$/i) {
          $Violated = ($FieldValue != $TargetValue) ? 1 : 0;
        }
        elsif ($Criterion =~ /^le$/i) {
          $Violated = ($FieldValue > $TargetValue) ? 1 : 0;
        }
        elsif ($Criterion =~ /^ge$/i) {
          $Violated = ($FieldValue < $TargetValue) ? 1 : 0;
        }
      }
      else {
        if ($Criterion =~ /^eq$/i) {
          $Violated = ($FieldValue ne $TargetValue) ? 1 : 0;
        }
        elsif ($Criterion =~ /^le$/i) {
          $Violated = ($FieldValue gt $TargetValue) ? 1 : 0;
        }
        elsif ($Criterion =~ /^ge$/i) {
          $Violated = ($FieldValue lt $TargetValue) ? 1 : 0;
        }
      }
      $Violations++ if ($Violated);
    }

    if ($Violations <= $OptionsInfo{Violations}) {
      WriteSDFileCmpdString();
      WriteTextFileCmpdData();
    }
  }
}

# Extract data fields by value using regular expression match...
#
# For each compound, every specified data field present in the compound is
# matched against its specified regular expression: the eq criterion is
# violated when the value doesn't match, the ne criterion when it does.
# Matching ignores case when the regex ignore case option is set. Compounds
# with no more than the allowed number of violations are written out.
#
sub ExtractDataFieldsByRegex {
  my($FileIndex) = @_;
  my($FieldLabel, $FieldValue, $Criterion, $Regex, $Violations, $Matched, @Lines);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    @Lines = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    SetupDataValues();
    $Violations = 0;

    LABEL: for $FieldLabel (@{$OptionsInfo{SpecifiedDataFieldLabels}}) {
      next LABEL if (!exists $SDFilesInfo{DataFieldValues}{$FieldLabel});

      $FieldValue = $SDFilesInfo{DataFieldValues}{$FieldLabel};
      $Criterion = $OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}{$FieldLabel};
      $Regex = $OptionsInfo{SpecifiedDataFieldRegexMap}{$FieldLabel};

      if ($Criterion =~ /^eq$/i) {
        $Matched = $OptionsInfo{RegexIgnoreCase} ? ($FieldValue =~ /$Regex/i) : ($FieldValue =~ /$Regex/);
        $Violations++ if (!$Matched);
      }
      elsif ($Criterion =~ /^ne$/i) {
        $Matched = $OptionsInfo{RegexIgnoreCase} ? ($FieldValue =~ /$Regex/i) : ($FieldValue =~ /$Regex/);
        $Violations++ if ($Matched);
      }
    }

    if ($Violations <= $OptionsInfo{Violations}) {
      WriteSDFileCmpdString();
      WriteTextFileCmpdData();
    }
  }
}

# Extract random compounds...
#
# Picks distinct random record numbers, after seeding the random number
# generator with the specified seed, and writes out the corresponding
# compounds. The number picked is the requested number of compounds, capped
# at the number of compounds counted for the file at $FileIndex.
#
sub ExtractRandomCompounds {
  my($FileIndex) = @_;
  my($CmpdNum, $CmpdCount, $NumToSelect, $NumWritten, $RandomIndex, @CmpdLines, %RandomCmpdIndexMap);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  $CmpdCount = $SDFilesInfo{CmpdCount}[$FileIndex];
  srand($OptionsInfo{Seed});

  # Can't select more compounds than the file contains...
  $NumToSelect = ($OptionsInfo{NumOfCmpds} < $CmpdCount) ? $OptionsInfo{NumOfCmpds} : $CmpdCount;

  # Keep drawing until enough distinct record numbers are collected; a repeated
  # draw leaves the map unchanged. The loop terminates because the target
  # never exceeds the number of available record numbers...
  %RandomCmpdIndexMap = ();
  while (scalar(keys %RandomCmpdIndexMap) < $NumToSelect) {
    $RandomIndex = int (rand $CmpdCount) + 1;
    $RandomCmpdIndexMap{$RandomIndex} = $RandomIndex;
  }
  $NumToSelect = scalar(keys %RandomCmpdIndexMap);

  $CmpdNum = 0;
  $NumWritten = 0;
  CMPDSTRING: while ($NumWritten < $NumToSelect && ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef}))) {
    $CmpdNum++;
    if (!exists $RandomCmpdIndexMap{$CmpdNum}) {
      next CMPDSTRING;
    }
    $NumWritten++;

    WriteSDFileCmpdString();

    if ($OptionsInfo{OutputTextFile}) {
      @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
  }
}

# Extract mol names...
#
# Writes a "MolName" column label followed by one line per compound holding
# its name, parsed from the first line of the compound string and quoted
# according to the output quote option.
#
sub ExtractMolNames {
  my($FileIndex) = @_;
  my($MolName, $NewTextFileRef, @CmpdLines);

  # Assign the label set instead of appending to it, as done by the other
  # extraction modes; appending adds "MolName" to whatever labels are
  # already there...
  @{$SDFilesInfo{DataLabels}} = ("MolName");
  WriteTextFileColLabels();

  $NewTextFileRef = $SDFilesInfo{NewTextFileRef};
  while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
    $MolName = QuoteAWord(ParseCmpdMolNameLine($CmpdLines[0]), $OptionsInfo{OutQuote});
    print $NewTextFileRef "$MolName\n";
  }
}

# Extract a specific compound record...
#
# Reads compounds until the one at the specified record number is reached,
# writes it out and stops.
#
sub ExtractRecordNum {
  my($FileIndex) = @_;
  my($RecordNum, @Lines);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  $RecordNum = 0;
  RECORD: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    $RecordNum++;
    next RECORD if ($RecordNum != $OptionsInfo{RecordNum});

    WriteSDFileCmpdString();

    if ($OptionsInfo{OutputTextFile}) {
      @Lines = split "\n", $SDFilesInfo{CmpdString};
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }

    # The requested record has been written; no need to read any further...
    last RECORD;
  }
}

# Extract a specific compound records...
#
# Writes out the compounds whose record numbers are in the specified set.
# Reading stops at the first record which is not in the set and either lies
# beyond the largest specified record number or follows the last of the
# specified records.
#
sub ExtractRecordNums {
  my($FileIndex) = @_;
  my($RecordNum, $WrittenCount, @Lines);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  $RecordNum = 0;
  $WrittenCount = 0;

  RECORD: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    $RecordNum++;

    if (!exists $OptionsInfo{RecordNums}{$RecordNum}) {
      if ($RecordNum > $OptionsInfo{RecordNumsMax} || $WrittenCount >= $OptionsInfo{RecordNumsCount}) {
        last RECORD;
      }
      next RECORD;
    }

    $WrittenCount++;
    WriteSDFileCmpdString();

    if ($OptionsInfo{OutputTextFile}) {
      @Lines = split "\n", $SDFilesInfo{CmpdString};
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
  }
}


# Extract compounds in a specific record range...
#
# Writes out the compounds whose record numbers lie between the specified
# start and end record numbers, both inclusive, and stops reading at the
# first record past the end of the range.
#
sub ExtractRecordRange {
  my($FileIndex) = @_;
  my($RecordNum, @Lines);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  $RecordNum = 0;
  RECORD: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    $RecordNum++;

    # Past the range: done. Before the range: keep reading...
    last RECORD if ($RecordNum > $OptionsInfo{EndRecordNum});
    next RECORD if ($RecordNum < $OptionsInfo{StartRecordNum});

    WriteSDFileCmpdString();

    if ($OptionsInfo{OutputTextFile}) {
      @Lines = split "\n", $SDFilesInfo{CmpdString};
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
  }
}

# Extract 2D compound records...
#
# Writes out the compounds for which IsCmpd2D() returns true.
#
sub Extract2DCmpdRecords {
  my($FileIndex) = @_;
  my(@Lines);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  RECORD: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    @Lines = split "\n", $SDFilesInfo{CmpdString};
    next RECORD if (!IsCmpd2D(\@Lines));

    WriteSDFileCmpdString();

    if ($OptionsInfo{OutputTextFile}) {
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
  }
}

# Extract 3D compound records...
#
# Writes out the compounds for which IsCmpd3D() returns true.
#
sub Extract3DCmpdRecords {
  my($FileIndex) = @_;
  my(@CmpdLines);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
    if (!IsCmpd3D(\@CmpdLines)) {
      next CMPDSTRING;
    }

    WriteSDFileCmpdString();

    if ($OptionsInfo{OutputTextFile}) {
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
  }
}


# Open input and output files...
#
# Opens the input SD file at $FileIndex for reading along with the new SD
# and/or text files selected by the output options, and stores the file
# handles in %SDFilesInfo under InputSDFileRef, NewSDFileRef and
# NewTextFileRef. Dies when any of the files can't be opened.
#
# Files are opened using the three-argument form of open and lexical file
# handles, so that characters in a file name are never interpreted as an open
# mode and no package-wide bareword handles are shared with other code.
#
sub OpenInputAndOutputFiles {
  my($FileIndex) = @_;
  my($SDFileName, $NewSDFileName, $NewTextFileName);

  $SDFilesInfo{NewTextFileRef} = undef;
  $SDFilesInfo{NewSDFileRef} = undef;

  $SDFileName = $SDFilesList[$FileIndex];
  $NewSDFileName = $SDFilesInfo{NewSDFileName}[$FileIndex];
  $NewTextFileName = $SDFilesInfo{NewTextFileName}[$FileIndex];

  if ($OptionsInfo{OutputTextFile} && $OptionsInfo{OutputSDFile}) {
    print "Generating files $NewSDFileName and $NewTextFileName...\n";
  }
  elsif ($OptionsInfo{OutputSDFile}) {
    print "Generating file $NewSDFileName...\n";
  }
  else {
    print "Generating file $NewTextFileName...\n";
  }

  if ($OptionsInfo{OutputSDFile}) {
    open my $NewSDFileHandle, ">", $NewSDFileName or die "Error: Couldn't open $NewSDFileName: $! \n";
    $SDFilesInfo{NewSDFileRef} = $NewSDFileHandle;
  }
  if ($OptionsInfo{OutputTextFile}) {
    open my $NewTextFileHandle, ">", $NewTextFileName or die "Error: Couldn't open $NewTextFileName: $! \n";
    $SDFilesInfo{NewTextFileRef} = $NewTextFileHandle;
  }

  open my $SDFileHandle, "<", $SDFileName or die "Error: Couldn't open $SDFileName: $! \n";
  $SDFilesInfo{InputSDFileRef} = $SDFileHandle;

}

# Close open input and output files...
#
# Closes whichever of the new SD, new text and input SD file handles are
# currently stored in %SDFilesInfo and resets the stored handles to undef.
#
sub CloseInputAndOutputFiles {
  my($FileRefKey);

  for $FileRefKey ("NewSDFileRef", "NewTextFileRef", "InputSDFileRef") {
    if ($SDFilesInfo{$FileRefKey}) {
      close $SDFilesInfo{$FileRefKey};
    }
    $SDFilesInfo{$FileRefKey} = undef;
  }
}

# Write out column labels for text file...
#
# Does nothing unless text file output is turned on. The labels line is made
# up of the current data labels, followed by a "StructureDataString" label
# when the structure data string is being written out as well.
#
sub WriteTextFileColLabels {
  my($ColLabelsLine, $NewTextFileRef, @DataLabels);

  if (!$OptionsInfo{OutputTextFile}) {
    return;
  }
  $NewTextFileRef = $SDFilesInfo{NewTextFileRef};

  # Work on a copy to leave the current data labels unchanged...
  @DataLabels = @{$SDFilesInfo{DataLabels}};
  if ($OptionsInfo{OutputStrDataString}) {
    # Append structure data string label...
    push @DataLabels, "StructureDataString";
  }

  $ColLabelsLine = JoinWords(\@DataLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileRef "$ColLabelsLine\n";
}

# Setup values for data fields...
#
# Fills in the data values list, in the order of the current data labels,
# from the data field values of the current compound; labels missing from the
# compound get an empty string.
#
sub SetupDataValues {
  @{$SDFilesInfo{DataValues}} = map { exists $SDFilesInfo{DataFieldValues}{$_} ? $SDFilesInfo{DataFieldValues}{$_} : "" } @{$SDFilesInfo{DataLabels}};
}

# Write out structure data and specific data fields to SD file...
#
# Does nothing unless SD file output is turned on. Writes the structure part
# of the current compound string, through the "M  END" line, followed by one
# data item for each of the current data labels and the "$$$$" record
# delimiter.
#
sub WriteSDFileCmpdData {
  my($MolString, $Count, $NewSDFileRef);

  if (!$OptionsInfo{OutputSDFile}) {
    return;
  }

  $NewSDFileRef = $SDFilesInfo{NewSDFileRef};

  # The molfile terminator line is "M  END", with two spaces after "M"; a
  # single space never matches it and the whole compound string, data fields
  # included, ends up being written out as structure data...
  ($MolString) = split "M  END", $SDFilesInfo{CmpdString};
  $MolString .= "M  END";
  print $NewSDFileRef "$MolString\n";

  for $Count (0 .. $#{$SDFilesInfo{DataLabels}}) {
    print $NewSDFileRef "> <$SDFilesInfo{DataLabels}[$Count]>\n$SDFilesInfo{DataValues}[$Count]\n\n";
  }
  print $NewSDFileRef "\$\$\$\$\n";
}

# Write out compound string...
#
# Does nothing unless SD file output is turned on. Writes the current
# compound string as it is, followed by a newline.
#
sub WriteSDFileCmpdString {
  my($NewSDFileRef);

  if (!$OptionsInfo{OutputSDFile}) {
    return;
  }

  $NewSDFileRef = $SDFilesInfo{NewSDFileRef};
  print $NewSDFileRef "$SDFilesInfo{CmpdString}\n";
}

# Write out data for text file...
#
# Does nothing unless text file output is turned on. Writes the current data
# values as one delimited line, optionally followed by the structure data
# string of the current compound with its newlines replaced by the structure
# data string delimiter.
#
sub WriteTextFileCmpdData {
  my($DataValuesLine, $NewTextFileRef);

  if (!$OptionsInfo{OutputTextFile}) {
    return;
  }

  $NewTextFileRef = $SDFilesInfo{NewTextFileRef};
  $DataValuesLine = JoinWords(\@{$SDFilesInfo{DataValues}}, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});

  # Handle multiple lines data values for data fields by joining 'em using semicolons...
  $DataValuesLine =~ s/\n/;/g;

  if ($OptionsInfo{OutputStrDataString}) {
    # Append structure data string...
    my($StrDataString, $OutQuoteValue, $OutDelim, $StrDataStringDelimiter);

    if ($OptionsInfo{StrDataStringWithFields}) {
      $StrDataString = $SDFilesInfo{CmpdString};
    }
    else {
      # Keep only the structure part, through the "M  END" terminator line...
      ($StrDataString) = split "M  END", $SDFilesInfo{CmpdString};
      $StrDataString .= "M  END";
    }
    $StrDataStringDelimiter = $OptionsInfo{StrDataStringDelimiter};
    $StrDataString =~ s/\n/$StrDataStringDelimiter/g;

    $OutDelim = $OptionsInfo{OutDelim};
    $OutQuoteValue = $OptionsInfo{OutQuote} ? "\"" : "";

    print $NewTextFileRef "$DataValuesLine${OutDelim}${OutQuoteValue}${StrDataString}${OutQuoteValue}\n";
  }
  else {
    print $NewTextFileRef "$DataValuesLine\n";
  }
}

# Retrieve information about input SD files...
#
# Fills in %SDFilesInfo with one entry per input SD file: whether the file is
# okay to process, names of the new SD and text files, number of compounds
# read during the scan, and lists of all and common data field labels.
#
# A file is skipped with a warning, and left marked as not okay, when it
# doesn't exist, isn't a SD file, can't be opened, would be overwritten by
# its own output, or has output files which already exist and overwriting is
# not turned on.
#
# Compounds are scanned only when the mode needs a compound count or data
# field labels; the compound count stays zero otherwise.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $NewFileName, $NewSDFileName, $NewTextFileName, $CmpdCount);

  %SDFilesInfo = ();

  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{CmpdCount}} = ();
  @{$SDFilesInfo{NewTextFileName}} = ();
  @{$SDFilesInfo{NewSDFileName}} = ();

  @{$SDFilesInfo{AllDataFieldLabels}} = ();
  @{$SDFilesInfo{CommonDataFieldLabels}} = ();

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    # Start with defaults corresponding to a file which can't be processed...
    $SDFilesInfo{FileOkay}[$Index] = 0;

    $SDFilesInfo{CmpdCount}[$Index] = 0;
    $SDFilesInfo{NewTextFileName}[$Index] = "";
    $SDFilesInfo{NewSDFileName}[$Index] = "";

    @{$SDFilesInfo{AllDataFieldLabels}[$Index]} = ();
    @{$SDFilesInfo{CommonDataFieldLabels}[$Index]} = ();

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }

    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Generate appropriate name for the new output file.
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
    $NewFileName = $FileName . $OptionsInfo{FileNameMode};
    # A specified root name is used only for a single input file...
    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $NewFileName = $RootFileName;
      }
      else {
        $NewFileName = $OptionsInfo{OutFileRoot};
      }
    }
    $NewSDFileName = $NewFileName . ".$OptionsInfo{SDFileExt}";
    $NewTextFileName = $NewFileName . ".$OptionsInfo{TextFileExt}";

    if ($OptionsInfo{OutputSDFile}) {
      if (lc($NewSDFileName) eq lc($SDFile)) {
        warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
        print "Specify a different name using \"-r --root\" option or use default name.\n";
        next FILELIST;
      }
    }

    if (!$OptionsInfo{Overwrite}) {
      if ($OptionsInfo{OutputSDFile}) {
        if (-e $NewSDFileName) {
          warn "Warning: Ignoring file $SDFile: New file, $NewSDFileName, already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{OutputTextFile}) {
        if (-e $NewTextFileName) {
          warn "Warning: Ignoring file $SDFile: New file, $NewTextFileName, already exists\n";
          next FILELIST;
        }
      }
    }

    # Three-argument open and a lexical file handle: characters in the file
    # name are never interpreted as an open mode...
    my($SDFileHandle);
    if (!open $SDFileHandle, "<", $SDFile) {
      warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    my($CountCmpds, $CollectDataFields);
    my($CmpdString, @CmpdLines, @DataFieldLabels, %DataFieldLabelsMap,@CommonDataFieldLabels);

    $CountCmpds = ($OptionsInfo{Mode} =~ /^randomcmpds$/i) ? 1 : 0;

    # Data field labels are needed only to write out a text file. The 2D and
    # 3D compound record modes are included as their text output also uses
    # the list of all data field labels...
    $CollectDataFields = ($OptionsInfo{OutputTextFile} && $OptionsInfo{Mode} =~ /^(alldatafields|commondatafields|randomcmpds|datafieldsbyvalue|datafieldsbyregex|datafieldbylist|datafielduniquebylist|datafieldnotbylist|recordnum|recordnums|recordrange|2dcmpdrecords|3dcmpdrecords)$/i) ? 1 : 0;

    $CmpdCount = 0;
    if ($CountCmpds || $CollectDataFields) {
      @DataFieldLabels = ();
      @CommonDataFieldLabels = ();
      %DataFieldLabelsMap = ();
      CMPDSTRING: while ($CmpdString = ReadCmpdString($SDFileHandle)) {
        $CmpdCount++;
        if ($OptionsInfo{Mode} =~ /^recordnum$/i) {
          # Only the labels of the requested record matter in this mode...
          if ($CmpdCount == $OptionsInfo{RecordNum}) {
            @CmpdLines = split "\n", $CmpdString;
            @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
            last CMPDSTRING;
          }
        }
        if ($CollectDataFields) {
          my($Label);
          @CmpdLines = split "\n", $CmpdString;
          # Process compound data header labels and figure out which ones are present for
          # all the compounds...
          if (@DataFieldLabels) {
            my (@CmpdDataFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines);
            my(%CmpdDataFieldLabelsMap) = ();
            # Setup a map for the current labels...
            for $Label (@CmpdDataFieldLabels) {
              $CmpdDataFieldLabelsMap{$Label} = "PresentInSome";
            }
            # Check the presence of old labels for this compound; mark the missing
            # ones as present only in some compounds...
            for $Label (@DataFieldLabels) {
              if (!$CmpdDataFieldLabelsMap{$Label}) {
                $DataFieldLabelsMap{$Label} = "PresentInSome";
              }
            }
            # Check the presence of labels for this compound in the old labels;
            # otherwise, add 'em...
            for $Label (@CmpdDataFieldLabels ) {
              if (!$DataFieldLabelsMap{$Label}) {
                # It's a new label...
                push @DataFieldLabels, $Label;
                $DataFieldLabelsMap{$Label} = "PresentInSome";
              }
            }
          }
          else {
            # Get the initial label set and set up a map...
            @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
            for $Label (@DataFieldLabels) {
              $DataFieldLabelsMap{$Label} = "PresentInAll";
            }
          }
          # Identify the common data field labels...
          if ($OptionsInfo{Mode} =~ /^commondatafields$/i) {
            @CommonDataFieldLabels = ();
            for $Label (@DataFieldLabels) {
              if ($DataFieldLabelsMap{$Label} eq "PresentInAll") {
                push @CommonDataFieldLabels, $Label;
              }
            }
          }
        }
      }
    }
    close $SDFileHandle;

    $SDFilesInfo{FileOkay}[$Index] = 1;

    $SDFilesInfo{NewTextFileName}[$Index] = $NewTextFileName;
    $SDFilesInfo{NewSDFileName}[$Index] = $NewSDFileName;

    $SDFilesInfo{CmpdCount}[$Index] = $CmpdCount;

    push @{$SDFilesInfo{AllDataFieldLabels}[$Index]}, @DataFieldLabels;
    push @{$SDFilesInfo{CommonDataFieldLabels}[$Index]}, @CommonDataFieldLabels;
  }
}

# Process options...
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{InDelim} = "\,";
  if ($Options{indelim} =~ /^semicolon$/i) {
    $OptionsInfo{InDelim} = "\;";
  }
  elsif ($Options{indelim} =~ /^tab$/i) {
    $OptionsInfo{InDelim} = "\t";
  }

  $OptionsInfo{OutDelim} = "\,";
  if ($Options{outdelim} =~ /^semicolon$/i) {
    $OptionsInfo{OutDelim} = "\;";
  }
  elsif ($Options{outdelim} =~ /^tab$/i) {
    $OptionsInfo{OutDelim} = "\t";
  }

  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;

  $OptionsInfo{RegexIgnoreCase} = ($Options{regexignorecase} =~ /^yes$/i) ? 1 : 0;

  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : undef;
  $OptionsInfo{Overwrite} = $Options{overwrite} ? $Options{overwrite} : undef;

  $OptionsInfo{NumOfCmpds} = $Options{numofcmpds};

  $OptionsInfo{ValueComparisonMode} = $Options{valuecomparisonmode};
  $OptionsInfo{NumericalComparison} = ($Options{valuecomparisonmode} =~ /^Numeric$/i) ?
1 : 0; 922 923 $OptionsInfo{Violations} = $Options{violations}; 924 $OptionsInfo{Seed} = $Options{seed}; 925 926 927 if ($Options{mode} =~ /^(datafields|datafieldsbyregex|datafieldsbyvalue|datafieldbylist|datafielduniquebylist|datafieldnotbylist)$/i) { 928 if ($Options{datafields} || $Options{datafieldsfile}) { 929 if ($Options{datafields} && $Options{datafieldsfile}) { 930 die "Error: For \"-m --mode\" option values of datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, or datafieldnotbylist specify only one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n"; 931 } 932 } 933 else { 934 die "Error: For \"-m --mode\" option values of datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, or datafieldnotbylist specify one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n"; 935 } 936 } 937 $OptionsInfo{DataFields} = $Options{datafields} ? $Options{datafields} : undef; 938 $OptionsInfo{DataFieldsFile} = $Options{datafieldsfile} ? $Options{datafieldsfile} : undef; 939 940 $OptionsInfo{RecordNum} = 0; $OptionsInfo{StartRecordNum} = 0; $OptionsInfo{EndRecordNum} = 0; 941 942 %{$OptionsInfo{RecordNums}} = (); 943 $OptionsInfo{RecordNumsMin} = 0; $OptionsInfo{RecordNumsMax} = 0; $OptionsInfo{RecordNumsCount} = 0; 944 945 $OptionsInfo{Record} = $Options{record} ? $Options{record} : undef; 946 947 if ($Options{mode} =~ /^(recordnum|recordnums|recordrange)$/i) { 948 if ($Options{record}) { 949 my($Record, @RecordSplit); 950 951 $Record = $Options{record}; 952 $Record =~ s/ //g; 953 954 @RecordSplit = split ",", $Record; 955 956 if ($Options{mode} =~ /^recordnum$/i ) { 957 if (@RecordSplit == 1) { 958 $OptionsInfo{RecordNum} = $RecordSplit[0]; 959 if ($OptionsInfo{RecordNum} <= 0) { 960 die "Error: The value specified, $OptionsInfo{RecordNum}, for option \"--records\" is not valid. 
Allowed values: > 0 \n"; 961 } 962 } 963 else { 964 die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 1 value is allowed.\n"; 965 } 966 } 967 elsif ($Options{mode} =~ /^recordnums$/i ) { 968 my($RecordNum, $RecordCount, @SortedRecordSplit); 969 970 @SortedRecordSplit = sort { $a <=> $b } @RecordSplit; 971 972 $RecordCount = 0; 973 RECORDNUM: for $RecordNum (@SortedRecordSplit) { 974 if (exists $OptionsInfo{RecordNums}{$RecordNum}) { 975 next RECORDNUM; 976 } 977 $RecordCount++; 978 $OptionsInfo{RecordNums}{$RecordNum} = $RecordNum; 979 } 980 $OptionsInfo{RecordNumsCount} = $RecordCount; 981 $OptionsInfo{RecordNumsMin} = $SortedRecordSplit[0]; 982 $OptionsInfo{RecordNumsMax} = $SortedRecordSplit[$#SortedRecordSplit]; 983 } 984 else { 985 if (@RecordSplit == 2) { 986 $OptionsInfo{StartRecordNum} = $RecordSplit[0]; 987 $OptionsInfo{EndRecordNum} = $RecordSplit[1]; 988 if ($OptionsInfo{StartRecordNum} <= 0 || $OptionsInfo{EndRecordNum} <= 0) { 989 die "Error: The value pair specified, $Options{record}, for option \"--records\" is not valid. 
Allowed values: > 0 \n"; 990 } 991 } 992 else { 993 die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 2 values is allowed.\n"; 994 } 995 if ($OptionsInfo{StartRecordNum} > $OptionsInfo{EndRecordNum}) { 996 die "Error: Start record number, $OptionsInfo{StartRecordNum}, must be smaller than end record number, $OptionsInfo{EndRecordNum}.\nSpecify different values using \"--record\" option.\n"; 997 } 998 } 999 } 1000 else { 1001 die "Error: For \"-m --mode\" option values recordnum, recordnums or recordrange, specify \"--record\" option value.\n"; 1002 } 1003 } 1004 1005 @{$OptionsInfo{SpecifiedDataFieldLabels}} = (); 1006 1007 my(@Words, $Line, $Value); 1008 if ($Options{mode} =~ /^datafields$/i) { 1009 @{$OptionsInfo{SpecifiedDataFieldLabels}} = (); 1010 if ($Options{datafields}) { 1011 @{$OptionsInfo{SpecifiedDataFieldLabels}} = split $OptionsInfo{InDelim}, $Options{datafields}; 1012 } 1013 elsif ($Options{datafieldsfile}) { 1014 open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n"; 1015 while ($Line = GetTextLine(\*DATAFIELDSFILE)) { 1016 @Words = quotewords($OptionsInfo{InDelim}, 0, $Line); 1017 if (@Words) { 1018 push @{$OptionsInfo{SpecifiedDataFieldLabels}}, @Words; 1019 } 1020 } 1021 close DATAFIELDSFILE; 1022 } 1023 } 1024 elsif ($Options{mode} =~ /^datafieldsbyvalue$/i) { 1025 my(@DataFieldsByValueTriplets); 1026 @DataFieldsByValueTriplets = (); 1027 if ($Options{datafields}) { 1028 @DataFieldsByValueTriplets = split $OptionsInfo{InDelim}, $Options{datafields}; 1029 } 1030 elsif ($Options{datafieldsfile}) { 1031 open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! 
\n"; 1032 while ($Line = GetTextLine(\*DATAFIELDSFILE)) { 1033 @Words = quotewords($OptionsInfo{InDelim}, 0, $Line); 1034 if (@Words) { 1035 push @DataFieldsByValueTriplets, @Words; 1036 } 1037 } 1038 close DATAFIELDSFILE; 1039 } 1040 if ((@DataFieldsByValueTriplets % 3)) { 1041 if ($Options{datafields}) { 1042 die "Error: Triplets not found in values specified by \"-d --datafields\" option\n"; 1043 } 1044 elsif ($Options{datafieldsfile}) { 1045 die "Error: Triplets not found in values specified by \"--datafieldsfile\" option\n"; 1046 } 1047 } 1048 my($Index, $Label, $Value, $Criterion); 1049 1050 @{$OptionsInfo{SpecifiedDataFieldLabels}} = (); 1051 %{$OptionsInfo{SpecifiedDataFieldValuesMap}} = (); 1052 %{$OptionsInfo{SpecifiedDataFieldCriteriaMap}} = (); 1053 1054 for ($Index = 0; $Index < @DataFieldsByValueTriplets; $Index = $Index + 3) { 1055 $Label = $DataFieldsByValueTriplets[$Index]; 1056 $Value = $DataFieldsByValueTriplets[$Index + 1]; 1057 $Criterion = $DataFieldsByValueTriplets[$Index + 2]; 1058 1059 if ($Criterion =~ /^(eq|le|ge)$/i) { 1060 push @{$OptionsInfo{SpecifiedDataFieldLabels}}, $Label; 1061 $OptionsInfo{SpecifiedDataFieldValuesMap}{$Label} = $Value; 1062 $OptionsInfo{SpecifiedDataFieldCriteriaMap}{$Label} = $Criterion; 1063 } 1064 else { 1065 warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion\n"; 1066 } 1067 } 1068 } 1069 elsif ($Options{mode} =~ /^datafieldsbyregex$/i) { 1070 my(@DataFieldsByRegexTriplets); 1071 1072 @DataFieldsByRegexTriplets = (); 1073 if ($Options{datafields}) { 1074 @DataFieldsByRegexTriplets = quotewords($OptionsInfo{InDelim}, 0, $Options{datafields}); 1075 } 1076 elsif ($Options{datafieldsfile}) { 1077 open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! 
\n"; 1078 while ($Line = GetTextLine(\*DATAFIELDSFILE)) { 1079 @Words = quotewords($OptionsInfo{InDelim}, 0, $Line); 1080 if (@Words) { 1081 push @DataFieldsByRegexTriplets, @Words; 1082 } 1083 } 1084 close DATAFIELDSFILE; 1085 } 1086 if ((@DataFieldsByRegexTriplets % 3)) { 1087 if ($Options{datafields}) { 1088 die "Error: Triplet not found in values specified by \"-d --datafields\" option\n"; 1089 } 1090 elsif ($Options{datafieldsfile}) { 1091 die "Error: Triplet not found in values specified by \"--datafieldsfile\" option\n"; 1092 } 1093 } 1094 1095 my($Index, $Label, $Value, $Criterion); 1096 1097 @{$OptionsInfo{SpecifiedDataFieldLabels}} = (); 1098 %{$OptionsInfo{SpecifiedDataFieldRegexMap}} = (); 1099 %{$OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}} = (); 1100 1101 for ($Index = 0; $Index < @DataFieldsByRegexTriplets; $Index = $Index + 3) { 1102 $Label = $DataFieldsByRegexTriplets[$Index]; 1103 $Value = $DataFieldsByRegexTriplets[$Index + 1]; 1104 $Criterion = $DataFieldsByRegexTriplets[$Index + 2]; 1105 1106 if ($Criterion =~ /^(eq|ne)$/i) { 1107 push @{$OptionsInfo{SpecifiedDataFieldLabels}}, $Label; 1108 $OptionsInfo{SpecifiedDataFieldRegexMap}{$Label} = $Value; 1109 $OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}{$Label} = $Criterion; 1110 } 1111 else { 1112 warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion; Supported values: eq or ne\n"; 1113 } 1114 } 1115 } 1116 elsif ($Options{mode} =~ /^(datafieldbylist|datafielduniquebylist|datafieldnotbylist)$/i) { 1117 my($Index, @DataFieldAndValuesList); 1118 if ($Options{datafields}) { 1119 @DataFieldAndValuesList = split $OptionsInfo{InDelim}, $Options{datafields}; 1120 } 1121 elsif ($Options{datafieldsfile}) { 1122 open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! 
\n"; 1123 while ($Line = GetTextLine(\*DATAFIELDSFILE)) { 1124 @Words = quotewords($OptionsInfo{InDelim}, 0, $Line); 1125 if (@Words) { 1126 push @DataFieldAndValuesList, @Words; 1127 } 1128 } 1129 close DATAFIELDSFILE; 1130 } 1131 if (@DataFieldAndValuesList < 2) { 1132 if ($Options{datafields}) { 1133 die "Error: Invalid number of values specified by \"-d --datafields\" option\n"; 1134 } 1135 elsif ($Options{datafieldsfile}) { 1136 die "Error: Invalid number values specified by \"--datafieldsfile\" option\n"; 1137 } 1138 } 1139 1140 $OptionsInfo{SpecifiedDataFieldLabel} = $DataFieldAndValuesList[0]; 1141 $OptionsInfo{SpecifiedDataFieldValuesCount} = @DataFieldAndValuesList - 1; 1142 %{$OptionsInfo{SpecifiedDataFieldValues}} = (); 1143 1144 for ($Index = 1; $Index < @DataFieldAndValuesList; $Index++) { 1145 $Value = $DataFieldAndValuesList[$Index]; 1146 $OptionsInfo{SpecifiedDataFieldValues}{$Value} = "NotFound"; 1147 } 1148 } 1149 1150 $OptionsInfo{SDFileExt} = "sdf"; 1151 $OptionsInfo{TextFileExt} = "csv"; 1152 1153 if ($Options{outdelim} =~ /^tab$/i) { 1154 $OptionsInfo{TextFileExt} = "tsv"; 1155 } 1156 1157 if ($Options{mode} =~ /^(alldatafields|molnames)$/i) { 1158 $OptionsInfo{OutputSDFile} = 0; 1159 $OptionsInfo{OutputTextFile} = 1; 1160 } 1161 else { 1162 $OptionsInfo{OutputSDFile} = ($Options{output} =~ /^(SD|both)$/i) ? 1 : 0; 1163 $OptionsInfo{OutputTextFile} = ($Options{output} =~ /^(text|both)$/i) ? 1 : 0; 1164 } 1165 1166 $OptionsInfo{StrDataString} = $Options{strdatastring}; 1167 $OptionsInfo{OutputStrDataString} = ($Options{strdatastring} =~ /^Yes$/i) ? 
1 : 0; 1168 1169 $OptionsInfo{StrDataStringDelimiter} = $Options{strdatastringdelimiter}; 1170 1171 if (IsEmpty($Options{strdatastringdelimiter})) { 1172 die "Error: No value specified for \"--StrDataStringDelimiter\" option.\n"; 1173 } 1174 $OptionsInfo{StrDataStringMode} = $Options{strdatastringmode}; 1175 $OptionsInfo{StrDataStringWithFields} = $Options{strdatastringmode} =~ /^StrAndDataFields$/i ? 1 : 0; 1176 1177 MODE: { 1178 if ($Options{mode} =~ /^alldatafields$/i) { $OptionsInfo{FileNameMode} = "AllDataDields"; last MODE; } 1179 if ($Options{mode} =~ /^commondatafields$/i) { $OptionsInfo{FileNameMode} = "CommonDataDields"; last MODE; } 1180 if ($Options{mode} =~ /^datafields$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFields"; last MODE; } 1181 if ($Options{mode} =~ /^datafieldsbyvalue$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldsByValue"; last MODE; } 1182 if ($Options{mode} =~ /^datafieldsbyregex$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldsByRegex"; last MODE; } 1183 if ($Options{mode} =~ /^datafieldbylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataField"; last MODE; } 1184 if ($Options{mode} =~ /^datafielduniquebylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedUniqueDataField"; last MODE; } 1185 if ($Options{mode} =~ /^datafieldnotbylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldNotByList"; last MODE; } 1186 if ($Options{mode} =~ /^molnames$/i) { $OptionsInfo{FileNameMode} = "MolName"; last MODE; } 1187 if ($Options{mode} =~ /^randomcmpds$/i) { $OptionsInfo{FileNameMode} = "RandomCmpds"; last MODE; } 1188 if ($Options{mode} =~ /^recordnum$/i) { $OptionsInfo{FileNameMode} = "RecordNum$OptionsInfo{RecordNum}"; last MODE; } 1189 if ($Options{mode} =~ /^recordnums$/i) { $OptionsInfo{FileNameMode} = "RecordNums"; last MODE; } 1190 if ($Options{mode} =~ /^recordrange$/i) { $OptionsInfo{FileNameMode} = "RecordNum$OptionsInfo{StartRecordNum}" . "To" . 
"$OptionsInfo{EndRecordNum}"; last MODE; } 1191 if ($Options{mode} =~ /^2dcmpdrecords$/i) { $OptionsInfo{FileNameMode} = "2DCmpdRecords"; last MODE; } 1192 if ($Options{mode} =~ /^3dcmpdrecords$/i) { $OptionsInfo{FileNameMode} = "3DCmpdRecords"; last MODE; } 1193 die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldbylist, datafielduniquebylist, , datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n"; 1194 } 1195 1196 } 1197 1198 # Setup script usage and retrieve command line arguments specified using various options... 1199 sub SetupScriptUsage { 1200 1201 # Retrieve all the options... 1202 %Options = (); 1203 $Options{numofcmpds} = 1; 1204 $Options{mode} = "alldatafields"; 1205 $Options{indelim} = "comma"; 1206 $Options{outdelim} = "comma"; 1207 $Options{output} = "SD"; 1208 $Options{quote} = "yes"; 1209 $Options{regexignorecase} = "yes"; 1210 $Options{valuecomparisonmode} = "numeric"; 1211 $Options{violations} = 0; 1212 $Options{seed} = 123456789; 1213 1214 $Options{strdatastring} = "no"; 1215 $Options{strdatastringdelimiter} = "|"; 1216 $Options{strdatastringmode} = "StrOnly"; 1217 1218 if (!GetOptions(\%Options, "help|h", "datafields|d=s", "datafieldsfile=s", "indelim=s", "mode|m=s", "numofcmpds|n=i", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "regexignorecase=s", "record=s", "root|r=s", "seed|s=i", "strdatastring=s", "strdatastringdelimiter=s", "strdatastringmode=s", "valuecomparisonmode=s", "violations|v=i", "workingdir|w=s")) { 1219 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; 1220 } 1221 if ($Options{workingdir}) { 1222 if (! 
-d $Options{workingdir}) { 1223 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; 1224 } 1225 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; 1226 } 1227 if ($Options{numofcmpds} < 1) { 1228 die "Error: The value specified, $Options{numofcmpds}, for option \"-n --numofcmpds\" is not valid. Allowed values: >= 1 \n"; 1229 } 1230 if ($Options{valuecomparisonmode} !~ /^(Numeric|Alphanumeric)$/i) { 1231 die "Error: The value specified, $Options{valuecomparisonmode}, for option \"--ValueComparisonMode\" is not valid. Allowed values: Numeric or Alphanumeric\n"; 1232 } 1233 if ($Options{violations} < 0) { 1234 die "Error: The value specified, $Options{violations}, for option \"-v --violations\" is not valid. Allowed values: >= 0 \n"; 1235 } 1236 if ($Options{mode} !~ /^(alldatafields|commondatafields|datafields|datafieldsbyvalue|datafieldsbyregex|datafieldbylist|datafielduniquebylist|datafieldnotbylist|molnames|randomcmpds|recordnum|recordnums|recordrange|2dcmpdrecords|3dcmpdrecords)$/i) { 1237 die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldbylist, datafielduniquebylist, datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n"; 1238 } 1239 if ($Options{output} !~ /^(SD|text|both)$/i) { 1240 die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n"; 1241 } 1242 if ($Options{indelim} !~ /^(comma|semicolon|tab)$/i) { 1243 die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma, tab, or semicolon\n"; 1244 } 1245 if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) { 1246 die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. 
Allowed values: comma, tab, or semicolon\n"; 1247 } 1248 if ($Options{quote} !~ /^(yes|no)$/i) { 1249 die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n"; 1250 } 1251 if ($Options{regexignorecase} !~ /^(yes|no)$/i) { 1252 die "Error: The value specified, $Options{regexignorecase}, for option \"--regexignorecase\" is not valid. Allowed values: yes or no\n"; 1253 } 1254 if ($Options{strdatastring} !~ /^(yes|no)$/i) { 1255 die "Error: The value specified, $Options{strdatastring}, for option \"--StrDataString\" is not valid. Allowed values: yes or no\n"; 1256 } 1257 if ($Options{strdatastringmode} !~ /^(StrOnly|StrAndDataFields)$/i) { 1258 die "Error: The value specified, $Options{strdatastringmode}, for option \"--StrDataStringMode\" is not valid. Allowed values: StrOnly or StrAndDataFields\n"; 1259 } 1260 } 1261