-
Notifications
You must be signed in to change notification settings - Fork 1
/
apa_routines.pm
executable file
·3071 lines (2520 loc) · 118 KB
/
apa_routines.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env perl
# OLD: #!/n/local/stage/perlbrew/perlbrew-0.43/perls/perl-5.16.0/bin/perl
# OLDER: #!/n/site/inst/Linux-x86_64/sys/bin/perl
use File::Path qw/ make_path remove_tree /;
use Storable qw/ nstore retrieve dclone /;
use Scalar::Util qw(looks_like_number);
use Capture::Tiny ':all';
use Data::Dumper;
use File::Spec;
use strict;
use Roman;
chomp(my $tilde = `readlink -f ~`);
## Author: Ariel Paulson ([email protected])
## A Perl module of common custom functions
### SCRIPT SETUP/QC AND I/O FUNCTIONS:
## validate : validates files; creates and validates temp directories, with clobber/no-clobber handling
## create_writedir : -- temporarily deprecated, I think --
## test_writability : ensure location or prefix is writable
## is_compressed : returns array: ( 1|0 if properly compressed , compression error message , linux read command, linux write command )
## open2 : open() wrapper with handling for compressed files, stdio
## generic_io : opens basic input and output handles, tests validity of args, given $input, $output, and $stdio
## backticks_bash : backticks-substitute wired to use bash; allows commands with single quotes, unlike "bash -c '$cmd'"
## execute : system() wrapper that sends command to screen, optional logfile, and runs command
## logreport : send a message to screen & logfile simultaneously
## print_table : print array-of-arrays object to a filehandle using a delimiter, or sprintf-ed
## nfsAnalyze : temp directory not getting removed? Run this on it to see what files are locking and why (prints to STDERR)
## validate_genoanno : testing validity of $geno/$anno in /n/data1/genomes/indexes, also STAR indexes
## validate_bam : checks if a bam file is corrupted
### GENERIC FUNCTIONS:
## timestamp : three different types of platform-independent timestamps
## runtime : takes 2 timestamps and returns elapsed time, in various scales
## monthconv : convert month names to numbers and vice versa
## benchmark : platform-independent system resource stamps
## numjust : pad a digit with leading zeroes
## unique : return unique values from an array, in order of appearance
## transpose : transposes a matrix, stored in an array of arrays
## pairs2subgraphs : given a 2-level hash describing a connectivity matrix (e.g. $hash{$key1}{$key2}, preferably reciprocal so that $hash{$key2}{$key1} also always exists), return all subgraphs.
## validate_memory : compares requested RAM with server RAM; if too much either downsizes or dies
## canonicalize : "readlink -f" without the link-following
## extract_names_from_paths : port of the apa_tools.R function. Finds the smallest defining substrings from a list of paths.
### MATH FUNCTIONS:
## mean : mean of an array
## median : median of an array
## stdev : standard deviation of an array
## strip_NA : remove NA entries from array
### BIOLOGICAL FUNCTIONS:
## revcomp : reverse complement for DNA/RNA, with full degeneracy/masking support
## revcomp_regex : reverse complement a DNA/RNA regular expression, with full degeneracy/masking support
## word_exp : get the expected frequency of a DNA/RNA/AA word, given background character frequencies
## coding_potential : calculate coding potential for a DNA sequence, using one of several methods ########## NOT WORKING PROPERLY ##########
## translate : translate a DNA sequence
## blockify : convert a sequence to a fasta block
## chrsort : sort a list of chromosomes the way I prefer. Special handling for roman chrnums (yeast) and drosophila chrs
## get_memedata : get a standard hash of data from a MEME run (DOES NOT WORK FOR 4.8.1)
## UTR_weld : take a hash of CDS entries + hash of UTR entries and return a hash of exons
### UNDER CONSTRUCTION: may have some functionality at this time, or may not
## readFile : read a tabular text file into an array-of-arrays or hash-of-arrays
## writeFile : write an array, array-of-arrays, or hash-of-arrays back to a tabular text file
## readFasta : read a fasta file into a hash
## writeFasta : write a fasta file from a hash
## get_fimodata : get a standard hash of data from a FIMO run (DOES NOT WORK FOR 4.8.1)
## get_tomdata : get a standard hash of data from a Tomtom run (DOES NOT WORK FOR 4.8.1)
## length_regex : calculate length of a regular expression, also report if length is fixed or minimum
## vcf_orderheader : ensures VCF header is "properly" sorted; optional genome arg will ensure contig lines are correct and ordered.
## read_org_metadata : reads organism metadata file into a hash (for genome build prep)
## read_VARIABLES : reads genome/transcriptome prep _VARIABLES/_TVARIABLES file into a a hash
## A few key locations for GenomeIndexes stuff
my $indexes = '/n/data1/genomes/indexes';
my $apabin = $ENV{SIMR_APA_BIN};
my $buildroot = $ENV{SIMR_BUILDROOT};
sub validate {
## validates files
## creates and validates temp directories, with clobber/no-clobber handling.
## returns rooted path to input location
## INPUT is an array with 4 elements:
## 1. an object type ('file'|'dir')
## 2. an object path (e.g. "path/to/file")
## 3. an optional object label (e.g. "input file")
## 4. an optional binary argument (1|0) which is 'required' for files, and 'clobber' for dirs.
my ($type, $path, $label, $arg) = @_;
my ($fail, $error, $fullpath);
my $blurb = $label ? "$label '$path'" : "'$path'";
if ($type eq 'file') {
my $need = $arg;
if ($need) {
$fail .= "$0: $blurb does not exist!\n" unless -e $path; # required file
} else {
$fail .= "$0: $blurb does not exist!\n" if defined $path && ! -e $path; # not required, but specified and non-existent
}
} elsif ($type eq 'dir') {
my $clobber = $arg;
if ($path && $path ne '.') { ##### NEVER CLOBBER '.' !!!!!
if ($clobber) {
remove_tree($path, {error => \$error});
sleep 1;
$fail .= "$0: Cannot remove existing $blurb: $error\n" if -d $path;
} else {
$fail .= "$0: $blurb already exists!\n" if -d $path;
}
make_path($path, {error => \$error});
sleep 1;
if (!$error && ! -d $path) {
sleep 5;
make_path($path, {error => \$error}); # try once more, if unsuccessful
sleep 1;
}
$fail .= "$0: Failed to create $blurb: $error!\n" unless -d $path;
} else {
$path = '.';
}
}
if ($fail) {
die $fail;
} else {
# ROOT THE PATH BEFORE RETURNING
#$fullpath = File::Spec->rel2abs($path) if $path; # this module is trash; cannot return true full path, only "relative" full path
chomp($fullpath = `readlink -f $path`); # 'readlink' actually works (unless on Mac, but all my code is intended to be run serverside, i.e. CentOS)
return $fullpath;
}
}
sub validate_build {
## Given $geno and $anno (and perhaps a sub-build type), ensures the dataset exists in /n/data1/genomes/indexes
my ($geno, $anno, $sub) = @_;
my ($prefix, $valid);
my $genodir = "$indexes/$geno";
if ($sub) {
$prefix = $anno ? "$genodir/$anno/$sub/$geno.$anno.$sub" : "$genodir/$sub/$geno.$sub";
## sub-builds always have fastas
## annotation sub-builds: 'ribosomes', 'funcRNA'
## genome sub-builds: 'repeats'
chomp($valid = `ls $genodir/$anno/$sub/$geno.$anno.$sub.fa 2>/dev/null | wc -l`);
} else {
$prefix = "$genodir/$anno/$geno.$anno";
## main builds: check for genedata.txt
chomp($valid = `ls $genodir/$anno/$geno.$anno.genedata.txt 2>/dev/null | wc -l`);
}
if ($valid) {
return $prefix;
} elsif ($sub && $anno) {
die "$0: '$sub' sub-build for genome '$geno' annotation '$anno' was not found!\n";
} elsif ($sub) {
die "$0: '$sub' sub-build for genome '$geno' was not found!\n";
} else {
die "$0: annotation build '$anno' for genome '$geno' was not found!\n";
}
}
sub create_writedir {
## A special-case wrapper to &validate, when script has MEME-style output args:
## -o <outdir_don't_clobber> OR -oc <outdir_clobber>
## Resolves and creates an output directory, or dies
## EITHER $outdir_nocl (-o) OR $outdir_cl (-oc) are the output paths, but not both:
## IF BOTH $outdir_nocl AND $outdir_cl: dies.
## IF $outdir_nocl AND NOT $outdir_cl: any existing location ($outdir_nocl) WILL NOT be clobbered.
## IF $outdir_cl AND NOT $outdir_nocl: any existing location ($outdir_cl) WILL be clobbered.
## $label is for a friendly die message, if dying is warranted
## $default is an OPTIONAL default value for $outdir
## returns fully rooted path to output location.
## example: $outdir = &create_writedir($o, $oc, 'output location', '.');
my ($outdir_nocl, $outdir_cl, $label, $default) = @_;
my ($outdir, $clobber);
if ($outdir_nocl && $outdir_cl) {
die "$0: Cannot specify both -o and -oc!\n";
} elsif ($outdir_cl) {
$outdir = $outdir_cl;
$clobber = 1;
} elsif ($outdir_nocl) {
$outdir = $outdir_nocl;
$clobber = 0;
} elsif ($default) {
$outdir = $default;
$clobber = 0; # NEVER CLOBBER A DEFAULT LOCATION: MAY BE '.'
} else {
die "$0: Failed to specify output directory, and no default!\n";
}
$label = 'output location' unless $label;
my $outdir2 = &validate('dir', $outdir, $label, $clobber);
return $outdir2;
}
sub test_writability {
## Input a path or pathed filename prefix
## Can either return 0|1 for writability, or just die if unwritable
my ($location, $die) = @_;
my $testfile = -d $location ? "$location/test" : "$location.test";
my $testpath = $location;
$testpath =~ s/[^\/]+$// unless -d $testpath;
system "touch $testfile";
sleep 1;
my $pass = -e $testfile ? 1 : 0;
system "rm -f $testfile";
die "$0: Output location '$testpath' is not writable!\n" if !$pass && $die;
return $pass;
}
sub is_compressed {
## INPUT: 1. filename, 2. OPTIONAL 1|0 indicating to use bgzip for write command
## OUTPUT: array: ( is-compressed flag , error message , read command, write command )
## recognizes gzip/bgzip/bzip2/xz/zip compression, also plain text and tar
## NOTE: read and write commands ALREADY CONTAIN PIPES, so just use like:
## open(IN, $read_command)
## open(OUT, $write_command)
my ($file, $bgzip_flag) = @_;
my $is_comp; # 1|0 is file compressed (or will it be compressed)
my $comp_err; # error message, if any
my @iocmd; # (read command, write command)
my $ext_type; # compression type according to file extension
my $file_type; # actual compression type (according to 'file' command)
(my $fext = "\L$file") =~ s/^.*\.//; # file extension, ensure lowercase
(my $unfile = $file) =~ s/\.$fext$//i; # $file without the compression suffix (unused if no compression suffix)
## Extension -> IO read type lookup
my @ordtypes = qw/ text bgzip gzip bzip2 zip xz /; # for test-running purposes: ensure bgzip is tested BEFORE gzip
my %iotypes = ('xz'=>'xz', 'gz'=>'gzip', 'tgz'=>'gzip', 'bgz'=>'bgzip', 'bz2'=>'bzip2', 'zip'=>'zip'); # '.gz' could also be bgzip, but this is corrected for elsewhere
$ext_type = $iotypes{$fext};
## Data for recognized compression types.
## Also includes the uncompressed type, WHICH ASSUMES FORMAT IS PLAIN TEXT.
## Elements = 1. expected 'file' output regexp, 2. integrity test command, 3. read command, 4. write command.
my $testx = "$file 2>&1 | grep -P \"\S\"";
my %typedata = (
'text' => { 'REGEX'=>' text$', 'TEST'=>"", 'READ'=>"cat $file |", 'WRITE'=>"| cat > $file" },
'xz' => { 'REGEX'=>'^xz compressed', 'TEST'=>"bash -c 'xz -t $testx'", 'READ'=>"unxz -q -c $file |", 'WRITE'=>"| xz -q -f > $file" },
'gzip' => { 'REGEX'=>'^gzip compressed', 'TEST'=>"bash -c 'gzip -t $testx'", 'READ'=>"gunzip -c $file |", 'WRITE'=>"| gzip -f > $file" },
'bgzip' => { 'REGEX'=>'^gzip compressed, extra field', 'TEST'=>"bash -c 'gzip -t $testx'", 'READ'=>"gunzip -c $file |", 'WRITE'=>"| bgzip -f > $file" }, # no 'bgunzip', neither is there 'bgzip -t'
'bzip2' => { 'REGEX'=>'^bzip2 compressed', 'TEST'=>"bash -c 'bzip2 -t $testx'", 'READ'=>"bunzip2 -c $file |", 'WRITE'=>"| bzip2 -f > $file" },
'zip' => { 'REGEX'=>'^Zip archive', 'TEST'=>"bash -c 'zip -T $testx'", 'READ'=>"unzip -p $file |", 'WRITE'=>"| cat > $unfile && rm -f $file && zip $file $unfile && rm -f $unfile" }
);
## 'zip' write command is crazy because zip is designed to archive existing files, NOT write from a pipe.
## It CAN write from a pipe, e.g. 'echo "blah" | zip test.zip -', but if you unzip 'test.zip', the filename is '-' not 'test'.
## Thus, command is: 1. write to non-zip temp, 2. destroy existing zip (otherwise it appends), 3. zip temp, 4. destroy temp.
## Yes, it actually works.
## As written above, commands are heavily justified to make them clearer.
## Here, collapse all that whitespace before actually running any of them!
## Not that leaving it in would break anything, but if errors are thrown, it could make reading the message more confusing...
foreach my $type (keys %typedata) {
$typedata{$type}{$_} =~ s/ +/ /g foreach keys %{ $typedata{$type} };
}
## Determine $file_type
if (-e $file) {
## See what 'file' says about the format of $file
chomp(my $filemsg = `file $file | cut -f2 -d: | sed 's/^ //'`);
foreach my $type (@ordtypes) {
## MUST be ordered! Test bgzip BEFORE gzip!!
if ($filemsg =~ /$typedata{$type}{REGEX}/) {
$file_type = $type;
@iocmd = ($typedata{$type}{READ}, $typedata{$type}{WRITE});
last;
}
}
## Compare $file_type and $ext_type
if ($file_type eq 'text') {
## some kind of text file (UTF-8, ASCII, etc)
## no issues, unless it looks like it SHOULD be compressed...
$comp_err = "WARNING: filename looks compressed, but data is not!" if $ext_type;
} elsif ($file_type) {
## then data is compressed
$is_comp = 1;
## test archive integrity
chomp($comp_err = `$typedata{$file_type}{TEST}`);
$comp_err = undef if $file_type eq 'zip' && $comp_err eq "test of $file OK";
if ($comp_err) {
## corrupted archive
@iocmd = (); # do not pretend we can read corrupted files
} else {
## intact archive
if ($ext_type eq 'gzip' && $file_type eq 'bgzip') {
## OK; this is an unavoidable issue caused by bgzip's non-unique file extension
} elsif ($ext_type eq $file_type) {
## OK; this is the expected scenario
} else {
## mismatched intentions?
$comp_err = "WARNING: actual compression format '$file_type' is not compatible with file extension '$fext'!\n";
}
}
} elsif ($filemsg eq 'data') {
## unknown file format: no handling
## no error either, unless it looks like it SHOULD be compressed...
$comp_err = "Compression format not recognized by 'file'! Archive is probably corrupted." if $ext_type;
} else {
## some other kind of file; no handling or error
}
} else {
## FILE DOES NOT EXIST YET
## no errors; have to believe $ext_type since no $file_type
if ($ext_type) {
## file will be compressed if $ext_type is valued
$is_comp = 1;
} else {
## file will NOT be compressed
$ext_type = 'text';
}
@iocmd = ($typedata{$ext_type}{READ}, $typedata{$ext_type}{WRITE});
$iocmd[1] = $typedata{bgzip}{WRITE} if $ext_type eq 'gzip' && $bgzip_flag; # switch write command, if requested
}
## Return: 1|0 if valid compressed, error if compression problem, linux commands to read/write file
return [$is_comp, $comp_err, @iocmd];
}
sub open2 {
## open() wrapper with handling for compressed files, stdio
## DOES NOT SUPPORT bidirectional file access, e.g. '+<', 'r+'
## OUTPUT: a filehandle
## INPUT: array with 3-4 elements:
## 1. mode: one of R, W, A (read, write, append), or a &open descriptor (like '-|', '>', etc)
## 2. file: a filename, or a command if opening a pipe, or undef. Most compressed files are recognized and handled.
## 3. label: an optional file label.
## 4. bgzip: OPTIONAL, 1|0 value. Read/Write compressed using 'bgzip' instead of 'gz'
## NOTE: 'append' mode not compatible with compressed files
## NOTE: 'bgzip' is ignored if no file is given
## handles gzip, bgzip, bzip2, and zip compression formats
my ($mode, $file, $label, $bgzip) = @_;
$label = $label ? "$label '$file'" : "'$file'"; # upgrade $label
my %rwamodes = ('r'=>'<', 'w'=>'>', 'a'=>'>>');
$mode = $rwamodes{"\L$mode"} if $mode =~ /^[rwa]$/i;
my ($exists, $iscomp, $comperr, $readcmd, $writecmd);
if ($file) {
$exists = -e $file;
($iscomp, $comperr, $readcmd, $writecmd) = @{ &is_compressed($file, $bgzip) };
}
my ($IN, $OUT);
if ($mode eq '-' || $mode eq '<-') {
## STDIN
if ($file) {
die "$0: do not specify a file when requesting STDIN access!\n";
} else {
open($IN, $mode) or die "$0: cannot open STDIN: $!\n";
}
} elsif ($mode eq '>-') {
## STDOUT
if ($file) {
die "$0: do not specify a file when requesting STDOUT access!\n";
} else {
open($OUT, $mode) or die "$0: cannot open STDOUT: $!\n";
}
} elsif ($mode eq '-|') {
## pipe in ($file should be a command)
if ($file) {
open($IN, $mode, $file) or die "$0: cannot open pipe from $label: $!\n";
} else {
die "$0: must specify a command when requesting pipe access!\n";
}
} elsif ($mode eq '|-') {
## pipe out ($file should be a command)
if ($file) {
open($OUT, $mode, $file) or die "$0: cannot open pipe to $label: $!\n";
} else {
die "$0: must specify a command when requesting pipe access!\n";
}
} elsif ($mode eq '<') {
## read (file)
if ($file) {
if ($iscomp) {
open($IN, $readcmd) or die "$0: problems executing '$readcmd': $!\n";
} else {
open($IN, $mode, $file) or die "$0: cannot read from file $label: $!\n";
}
} else {
die "$0: must specify a file when requesting a '$mode' handle!\n";
}
} elsif ($mode eq '>' || $mode eq '>>') {
## write or append (file)
my $blurb = $mode eq '>' ? 'write' : 'append';
if ($file) {
if ($iscomp) {
die "$0: cannot append to compressed files! Please use another method.\n" if $mode eq '>>';
open($OUT, $writecmd) or die "$0: problems executing '$writecmd': $!\n";
} else {
open($OUT, $mode, $file) or die "$0: cannot $blurb to file $label: $!\n";
}
} else {
die "$0: must specify a file when requesting a '$mode' handle!\n";
}
} else {
die "$0: 'open2' access mode value '$mode' not supported!\n";
}
if ($IN) {
return $IN;
} elsif ($OUT) {
return $OUT;
}
}
sub generic_io {
## designed for simplistic situations with very direct, single-input -> single-output workflows, e.g. pipes
## opens (and returns) input and output handles
## resolves any conflict between $input and $stdio
## chooses output mode based on $output
## ensures handles are openable
##
## usage: "my ($IN, $OUT) = &generic_io($input, $output, $stdio);", where args are from GetOptions, @ARGV, etc
##
## NOTE: be sure that print-to-screen messages in your script go to STDERR, since $output may be STDOUT and thus captured elsewhere.
my ($INPUT, $OUTPUT, $STDIO) = @_;
my ($IN, $OUT);
if ($INPUT) {
die "$0: do not specify both '-' and '-i'!\n" if $STDIO;
$IN = &open2('R', $INPUT, 'input VCF');
} elsif ($STDIO) {
$IN = *STDIN;
} else {
die "$0: no input specified!\n";
}
if ($OUTPUT) {
$OUT = &open2('W', $OUTPUT, 'output VCF');
} else {
$OUT = *STDOUT;
}
return ($IN, $OUT);
}
sub backticks_bash {
## thanks to http://www.perlmonks.org/?node_id=691255
## need to integrate this? $stderr = tee_stderr \&code;
my ($cmd, $login) = @_;
my $pipe;
if ($login) {
open($pipe, '-|', '/bin/bash', '--login -c', $cmd) or return;
} else {
open($pipe, '-|', '/bin/bash', '-c', $cmd) or return;
}
local $/ = wantarray ? $/ : undef;
<$pipe>
}
sub execute {
## system() wrapper that sends command to screen, optional logfile, and runs command
## INPUT: array with 4 elements:
## 1. command to run
## 2. visualize command on std-what? (1=out,2=err)
## 3. optional: path to logfile, or open filehandle
## 4. optional: capture messages from std-what? (1=out,2=err,3=both) # BE SURE THIS DOES NOT CONFLICT WITH COMMAND OUTPUT
## 5. optional: 1|0; execute with 'bash -c'
my ($cmd, $std, $fh, $cap, $bash) = @_;
chomp $cmd; # just in case
if ($std == 1) {
print STDOUT "$cmd\n";
} elsif ($std == 2) {
print STDERR "$cmd\n";
}
$cmd =~ s/^\n?RUNNING: //; # goofy way to allow "headerization" of commands; later versions will be more formal
if ($fh) {
if (defined fileno($fh)) {
## filehandle
print $fh "$cmd\n";
} else {
## file
open my $LOG, '>>', $fh or die "$0: Failed to append to log file '$fh': $!\n";
print $LOG "$cmd\n";
close $LOG;
}
}
my $cmd2_sh = sub { system($cmd) };
my $cmd2_bash = sub { system("bash -c '$cmd'") };
my $cmd2 = $bash ? $cmd2_bash : $cmd2_sh;
my $msg;
if ($bash && $cmd =~ /'/) {
$msg = &backticks_bash($cmd);
} else {
if ($cap == 1) {
$msg = $std ? tee_stdout \&$cmd2 : capture_stdout \&$cmd2;
} elsif ($cap == 2) {
$msg = $std ? tee_stderr \&$cmd2 : capture_stderr \&$cmd2;
} elsif ($cap == 3) {
$msg = $std ? tee_merged \&$cmd2 : capture_merged \&$cmd2;
} else {
&$cmd2;
}
}
if ($msg =~ /\S/) {
if ($fh) {
if (defined fileno($fh)) {
## filehandle
print $fh "$msg\n";
} else {
## file
open my $LOG, '>>', $fh or die "$0: Failed to append to log file '$fh': $!\n";
print $LOG "$msg\n";
close $LOG;
}
}
}
}
sub logreport {
## prints a message to STDOUT and to a given logfile
my ($msg, $out, $fh, $nochomp) = @_;
chomp $msg unless $nochomp; # so you don't have to remember if it needs newlines or not
if (defined fileno($fh)) {
## $fh is a handle
print $fh "$msg\n";
} else {
## $fh is a logfile
open my $LOG, '>>', $fh or die "$0: Failed to append to log file '$fh': $!\n";
print $LOG "$msg\n";
close $LOG;
}
if ($out == 1) {
print STDOUT "$msg\n";
} elsif ($out == 2) {
print STDERR "$msg\n";
}
}
sub print_table {
## print array-of-arrays object to a filehandle using a delimiter, or sprintf-ed
## input: 1=reference to array-of-arrays, 2=1|0 use sprintf, 3=filehandle, 4=delimiter string
## writes directly to filehandle
## only understands three column datatypes: string, integer and float
## strings columns are left-justified, numeric columns are right-justified
## float formatting specifies number of digits to the right of decimal by the first float value seen per column
## - thus float values should be sprintf-ed BEFORE being stored in the object, to avoid formatting inconsistencies
my ($ref, $do_sprintf, $fh, $delim) = @_;
$fh = *STDOUT unless $fh;
my @AOA = @$ref;
my @rows = (0..$#AOA);
my @cols = (0..$#{ $AOA[0] });
if ($do_sprintf) {
$delim = ' ' unless defined $delim; # default sprintf delimiter: two spaces
my $fmt;
## first pass: get column widths and numeric/non status
## construct format on the fly
foreach my $col (@cols) {
my ($width, $right);
my $numeric = 1; # initial assumption
my $float = 0; # initial assumption
foreach my $row (@rows) {
my $w = length($AOA[$row][$col]);
$width = $w if $w > $width;
next if $row == 0; # don't format-test the header row
next unless $numeric; # still have breakable assumption(s)
if ($AOA[$row][$col] =~ /[^0-9.-]/) {
$numeric = $float = 0; # column is not numeric
} elsif ($AOA[$row][$col] =~ /\./) {
$float = 1; # irrelevant unless $numeric
$right = length( (split /\./, $AOA[$row][$col])[-1] );
}
}
$fmt .= '%'.($numeric?'':'-').$width.($numeric?($float?".${right}f":'i'):'s');
$fmt .= $col == $#cols ? "\n" : $delim;
}
(my $hfmt = $fmt) =~ s/(i|\.\d+f)(\s)/s$2/g; # header format: convert numeric 'i' or '.#f' to string 's'; preserve following space OR newline
$hfmt =~ s/\%([^-])/%-$1/g; # ensure headers all left-justify
## second pass: print
print $fh sprintf($hfmt, @{ $AOA[0] });
foreach my $row (1..$#rows) {
print $fh sprintf($fmt, @{ $AOA[$row] });
}
} else {
$delim = "\t" unless defined $delim; # default non-sprintf delimiter: tab
foreach my $row (@rows) {
print $fh join($delim, @{ $AOA[$row] }), "\n";
}
}
}
sub nfsAnalyze {
## reports on undeletable files (i.e. files that, when deleted, result in undeletable .nfs files).
## for troubleshooting temp directories that cannot be removed by the script.
my $dir = shift;
my @files = glob "$dir/*";
my $maxw;
foreach (@files) {
$maxw = length($_) if length($_) > $maxw;
}
my $fmt = 'FILE: %-'.$maxw.'s | %s'."\n";
my %already;
print STDERR "\n\n.NFS FILE ANALYSIS\nTHIS PID: $$\nCONTENTS OF '$dir':\n";
foreach my $file (@files) {
my $fuser = `fuser $file 2>/dev/null`;
$fuser =~ s/\s//g;
my $msg = $fuser ? "IN USE: $fuser" : 'not in use';
system "rm -f $file";
my @nfs = glob "$dir/.nfs*";
foreach my $nfs (@nfs) {
next if $already{$nfs}; # some previous file's .nfs file
$already{$nfs} = 1; # this is a new .nfs file
$msg .= " | NOT DELETABLE: $nfs";
}
print STDERR sprintf($fmt, $file, $msg);
}
print "\n\n";
}
sub validate_genoanno {
my ($GENO, $ANNO, $STARLEN) = @_;
my $Gdir = "$indexes/$GENO";
my $Gpref = "$indexes/$GENO/$GENO";
my $Adir = "$indexes/$GENO/$ANNO";
my $Apref = "$indexes/$GENO/$ANNO/$GENO.$ANNO";
die "$0: Expected dataset path '$Gdir' does not exist!\n" unless -d $Gdir;
die "$0: Expected dataset path '$Adir' does not exist!\n" if $ANNO && ! -d $Adir;
my @out = ($Gdir, $Gpref, $Adir, $Apref);
if ($STARLEN) {
## if $STARLEN (read length), then must be an $ANNO STAR idx, because $geno idx has no indicated read length
my $Sdir = "$indexes/$GENO/$ANNO/STAR_${STARLEN}bp";
die "$0: Expected STAR index '$Sdir' does not exist!\n" if $STARLEN && ! -d $Sdir;
push @out, $Sdir;
}
return @out;
}
sub validate_bam {
## Returns non-null if the bam file is corrupted
my $BAM = shift; # /path/to/file.bam
my $TMP = "validate_bam.$$.tmp";
system "samtools view $BAM 2> $TMP | head > /dev/null";
chomp(my $ERR = `cat $TMP`);
system "rm -f $TMP";
return $ERR;
}
sub timestamp {
## returns date, time, or date+time timestamps
my $timetype = shift; # FULL = date + time | DATE = date | TIME = time
my $timestamp;
my %daynames = (0,'Sun', 1,'Mon', 2,'Tue', 3,'Wed', 4,'Thu', 5,'Fri', 6,'Sat');
my %monthnames = (0,'Jan', 1,'Feb', 2,'Mar', 3,'Apr', 4,'May', 5,'Jun', 6,'Jul', 7,'Aug', 8,'Sep', 9,'Oct', 10,'Nov', 11,'Dec');
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime();
if ($timetype eq 'FULL') {
$timestamp = "$daynames{$wday} $monthnames{$mon} ".sprintf("%02d %4d %02d:%02d:%02d", $mday,$year+1900,$hour,$min,$sec);
} elsif ($timetype eq 'DATE') {
$timestamp = "$daynames{$wday} $monthnames{$mon} ".sprintf("%02d %4d", $mday,$year+1900);
} elsif ($timetype eq 'TIME') {
$timestamp = sprintf("%02d:%02d:%02d", $hour,$min,$sec);
}
return $timestamp;
}
sub runtime { # UNDER CONSTRUCTION
## takes 2 and returns the difference, in the specified timescale
## mode 1 = 'FULL' timestamp from above timestamp() function, e.g. "Fri May 4 2012 14:00:47"
## mode 2 = generic [YYYY-MM-DD HH:MM:SS] timestamp, e.g. "2012-05-04 14:00:47"
## mode 3 = Unix `date` style timestamp, e.g. "Fri May 4 14:00:47 CDT 2012"
### requires 2 timestamp('FULL') entries
my (%timestamps, %timediff, %timeparts, %absolute, $scale, $mode);
($timestamps{1}, $timestamps{2}, $scale, $mode) = @_; # timestamps with format e.g. 'Thu Jun 23 2011 12:57:08'; for $scale see %scales below
my %scales = map {($_=>1)} qw/ FULL YEAR MONTH DAY HOUR MINUTE SECOND /; # time scale to return results in; 'FULL' = full breakout
# note that scale 'MONTH' will return month breakdown as months elapsed + fraction of end-month elapsed
my %monthdays = (1,31, 2,28, 3,31, 4,30, 5,31, 6,30, 7,31, 8,31, 9,30, 10,31, 11,30, 12,31);
my %monthsecs = map {($_=>$monthdays{$_}*86400)} keys %monthdays;
my %monthnums = (1,'Jan', 2,'Feb', 3,'Mar', 4,'Apr', 5,'May', 6,'Jun', 7,'Jul', 8,'Aug', 9,'Sep', 10,'Oct', 11,'Nov', 12,'Dec');
my %timescales = ('YEAR',31536000, 'DAY',86400, 'HOUR',3600, 'MIN',60, 'SEC',1); # times in seconds | months variable, so not here
foreach my $t (1,2) {
my ($dname, $mname, $mon, $date, $yr, $zone, $hr, $min, $sec);
if ($mode == 1) {
($dname, $mon, $date, $yr, $hr, $min, $sec) = split /\s+/, $timestamps{$t};
$mon = monthconv($mname,2);
} elsif ($mode == 2) {
($yr, $mon, $date, $hr, $min, $sec) = split /[\s:-]+/, $timestamps{$t};
} elsif ($mode == 3) {
($dname, $mname, $date, $hr, $min, $sec, $zone, $yr) = split /[\s:]+/, $timestamps{$t};
}
$timeparts{$t}{YEAR} = $yr;
$timeparts{$t}{MONTH} = $mon;
$timeparts{$t}{DAY} = $date;
$timeparts{$t}{HOUR} = $hr;
$timeparts{$t}{MIN} = $min;
$timeparts{$t}{SEC} = $sec;
$absolute{$t} = ($yr-1)*31536000 + ($date-1)*86400 + $hr*3600 + $min*60 + $sec; # fully elapsed years, days only
$absolute{$t} += $monthsecs{$_} foreach 1..$timeparts{$t}{MONTH}; # add seconds per day of each fully elapsed month in YTD
}
my $seconds = $absolute{2} - $absolute{1}; # total seconds elapsed
if ($scale eq 'FULL' || $scale eq 'YEAR') {
if ($seconds > $timescales{YEAR}) {
$timediff{YEAR} = $seconds/$timescales{YEAR};
$seconds %= $timescales{YEAR};
}
}
if ($scale eq 'FULL' || $scale eq 'MONTH') {
my $thismonth = $timeparts{1}{MONTH};
if ($seconds > $monthsecs{$thismonth}) { # months begin elapsing from the start-time month
my $continue = 1;
while ($continue) {
$timediff{MONTH}++;
$seconds -= $monthsecs{$thismonth};
$thismonth++;
$continue = 0 if $seconds < $monthsecs{$thismonth}; # stop if fewer seconds exist than do in the next month
}
}
}
if ($scale eq 'FULL' || $scale eq 'DAY') {
if ($seconds > $timescales{DAY}) {
$timediff{DAY} = $seconds/$timescales{DAY};
$seconds %= $timescales{DAY};
}
}
if ($scale eq 'FULL' || $scale eq 'HOUR') {
# if ($seconds > $timescales{HOUR}) {
$timediff{HOUR} = $seconds/$timescales{HOUR};
$seconds %= $timescales{HOUR};
# }
}
if ($scale eq 'FULL' || $scale eq 'MIN') {
if ($seconds > $timescales{MIN}) {
$timediff{MIN} = $seconds/$timescales{MIN};
$seconds %= $timescales{MIN};
}
}
if ($scale eq 'FULL') {
my @elapsed;
push @elapsed, int($timediff{YEAR}), " years" if $timediff{YEAR};
push @elapsed, int($timediff{MONTH}), " months" if $timediff{MONTH};
push @elapsed, int($timediff{DAY}), " days" if $timediff{DAY};
push @elapsed, int($timediff{HOUR}), " hours" if $timediff{HOUR};
push @elapsed, int($timediff{MIN}), " minutes" if $timediff{MIN};
push @elapsed, int($timediff{SEC}), " seconds" if $timediff{SEC};
my $elapsed = join ', ', @elapsed;
return "Elapsed: $elapsed\n";
} else {
return $timediff{$scale};
}
}
sub monthconv {
## converting month names to numbers or vice versa
my ($inmonth, $dir) = @_; # $dir = 1 for num -> name | $dir = 2 for name -> num
my %monthnames1 = (1,'Jan', 2,'Feb', 3,'Mar', 4,'Apr', 5,'May', 6,'Jun', 7,'Jul', 8,'Aug', 9,'Sep', 10,'Oct', 11,'Nov', 12,'Dec');
my %monthnames2 = reverse %monthnames1;
if ($dir == 1) {
$inmonth =~ s/^0+//; # drop any leading zeroes
return $monthnames1{$inmonth};
} elsif ($dir == 2) {
my $outmonth = $monthnames2{$inmonth};
$outmonth = "0$outmonth" if $outmonth < 10;
return $outmonth;
} else {
die "Conversion direction '$dir' must be 1 (num->name) or 2 (name->num)!\n";
}
}
sub benchmark {
## captures 'top' info on a process (Linux -OR- Windows) and returns it, annotated (or writes it to file)
my ($pid, $platform, $file, $msg, $script) = @_; # $msg, $script, $file can be null
# print "Received benchmark request: '$pid', '$msg', '$script', '$platform', '$file'\n";
my $string;
if ($platform eq 'LINUX') {
my @output = split /\n/, `top -b -p $pid -n 1`;
my @stuff = split /\s+/, $output[7];
$string = join "\t", ($msg, $script, @stuff);
} elsif ($platform eq 'WIN2K') {
my @output = split /\n/, `tasklist /V /FI "PID eq $pid"`;
my @stuff = split /\s+/, $output[3];
$string = join "\t", ($msg, $script, @stuff);
} else {
$string = "\nBenchmark subroutine doesn't know what to do on $platform operating systems!\nNo RAM usage data will be available.";
warn $string;
}
if ($file) {
open OUT, ">> $file" or warn "\n&benchmark cannot append to $file: $!\n";
print OUT "$string\n";
close OUT;
} else {
return $string;
}
}
sub revcomp {
## reverse-complement DNA/RNA with full degeneracy/masking support
my $SEQ = shift;
$SEQ = $$SEQ if $SEQ =~ /SCALAR/; # convert references
($SEQ = reverse $SEQ) =~ tr/ACGTURYSWKMHDVBNacgturyswkmhdvbn/TGCAAYRSWMKDHBVNtgcaayrswmkdhbvn/;
return \$SEQ; # return reference
}
sub revcomp_regex {
## reverse-complement DNA/RNA regular expressions with full degeneracy/masking support
## DOES NOT SUPPORT ALL POSSIBLE REGULAR EXPRESSIONS. Does not use regexp parse trees. Designed for revcomping simple DNA/RNA motif regexps.
## Basic and non-greedy quantifiers, custom classes, and simple groupings are supported, or at least as far as I have tested.
## Anchors, escaped chars and class shortcuts, lookarounds, backreferences, logicals, no-memory groupings, and nested groupings are NOT supported.
my $SEQ = shift;
return('') if $SEQ =~ /\(\?[<!=:]/; # no lookarounds or no-memory groupings
return('') if $SEQ =~ /(?<!\\)\\(?!\\)/; # no backrefs or escaped chars
return('') if $SEQ =~ /\([^\(]*\(/; # no nested groupings
return('') if $SEQ =~ /^\^/ || $SEQ =~ /(?<!\\)\$$/; # no anchors
my $nonparen = '[^\(\)]+';
my $nonbrack = '[^\[\]]+';
my $nonenc = '[\\\]?[^\(\)\[\]]'; # aiming to capture escaped single chars, too -- but in general, escaped things not supported
my $paren = '\('.$nonparen.'\)';
my $brack = '\['.$nonbrack.'\]';
my $quants = '(?<![\\\])[\*\+\?]{1,2}'; # NON-ESCAPED quantifiers -- but in general, escaped things not supported
my $braces = '\{[\d,]+\}';
my $anytarg = $paren.'|'.$brack.'|'.$nonenc;
$SEQ = $$SEQ if $SEQ =~ /SCALAR/; # convert references
($SEQ = reverse $SEQ) =~ tr/ACGTURYSWKMHDVBNacgturyswkmhdvbn\[\]\(\)\{\}/TGCAAYRSWMKDHBVNtgcaayrswmkdhbvn\]\[\)\(\}\{/;
$SEQ =~ s/\?([+*])($anytarg)/$2$1?/g; # un-reverse minimal quantifiers
$SEQ =~ s/($quants|$braces)($anytarg)/$2$1/g; # put quantifiers on the right side of targets
$SEQ =~ s/\[($nonbrack)\^\]/[^$1]/g; # put class negators on the left inside of brackets
$SEQ =~ s/\{(\d*)(,?)(\d+)?\}/{$3$2$1}/g; # un-reverse quantifier order inside braces
return \$SEQ; # return reference
}
sub length_regex {