forked from Villemoes/rvutils
-
Notifications
You must be signed in to change notification settings - Fork 0
/
split_col
executable file
·358 lines (297 loc) · 9.9 KB
/
split_col
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
use File::Basename;
use Errno qw(EMFILE ENFILE);
# --column
# --template
### --substr
# --header
# TODO:
#
# * Add option to remove the selected column from the output.
#
# * Add option to sanitize the column content, e.g. replace / by _,
# maybe also handle other filename-unfriendly chars.
my $template = undef;
my $prefix = undef;
my $suffix = undef;
my $template_substr = '{}';
my $header = 0;
my $column = 1;
my $delim = "\t";
my $headerline = undef;
my $help = 0;
my $current_template;
my $current_infile;
my $current_fields;
sub err_exit {
my $fmt = shift;
my $txt = sprintf($fmt, @_);
$txt .= "\n" unless $txt =~ m/\n$/;
printf STDERR $txt;
exit(1);
}
sub print_usage {
err_exit("usage: %s [--template=a{}b] [--column=<n>] files", $0);
}
sub print_help {
print <<HELP;
$0 - split a file using column contents
Options:
--column n Use column n (counting 1-based), default 1
--header Treat the first line of each input file as a header line.
That line will be copied verbatim to each output file.
--template str Use str as the template for output files names. str
should contain at least one replacement string (see
below) - this will be replaced by the contents of
the appropriate column.
If --template is not given, the output from an input file foo.bar.ext
will use the template foo.bar_{}.ext. If the input filename contains
no ., the implicitly used template is filename_{}.
The only supported column delimiter is tab (\\t).
For example, suppose foo contains these lines:
horse 4
spider 8
cow 4
octopus 8
human 2
millipede 1000
Running
$0 --column 2 foo
will produce four files, foo_2, foo_4, foo_8, foo_1000, containing 1,
2, 2, and 1 lines, respectively. Had the option "--template
'animals_with_{}_legs'" been given, the output files would have had
more descriptive names.
Sanitizing the column contents or the resulting file names is not
attempted. It is obviously dangerous if the column may contain
slashes, so this should be avoided, as should any character which one
wouldn't want to end up in a filename.
Use - to read from stdin (or just omit trailing file names). When
reading from stdin, the filename implicitly used for generating output
file names is STDIN, so it is recommended to use the --template option
in that case.
Apart from {}, there are a few other replacement strings available:
{.} Current input file without extension:
foo.jpg becomes foo
subdir/foo.jpg becomes subdir/foo
sub.dir/foo.jpg becomes sub.dir/foo
sub.dir/bar remains sub.dir/bar
{..} Extension of current input file, including the dot. If the
current input file does not have an extension, this is empty. In
the above four examples, we would have ".jpg", ".jpg", ".jpg",
and "", respectively.
{.}{..} is always equal to the current input file name.
{n} Column number n. One can split based on multiple columns by simply
using multiple occurrences of these (and then the --column option
is redundant). For example
split_col --template 'foo{}bar' --column 4
is equivalent to
split_col --template 'foo{4}bar'
{} is simply a synonym for {c}, where c is the value of the
--column option (and which defaults to 1).
The default template described above is actually {.}_{}{..}
HELP
exit(0);
}
GetOptions("template=s" => \$template,
"header!" => \$header,
"column=i" => \$column,
"help|h" => \$help)
or print_usage();
if ($help) {
print_help();
}
err_exit("column index must be positive") if ($column <= 0);
# $column is given 1-based, but we use 0-based indexing internally
$column--;
$template = '{.}_{}{..}' unless defined $template;
err_exit("at least one replacement string ({} or {n}) must occur in the template")
if (!($template =~ m/\{(?:[1-9][0-9]*)?\}/));
sub compute_current_template {
my ($base, $ext);
$current_template = $template;
if ($current_infile =~ m#^(.*)(\.[^/]*)$#) {
$base = $1;
$ext = $2;
} else {
$base = $current_infile;
$ext = '';
}
$current_template =~ s/\Q{.}\E/$base/g;
$current_template =~ s/\Q{..}\E/$ext/g;
}
my %outfiles = ();
my $listhead = {};
$listhead->{next} = $listhead->{prev} = $listhead;
my $wopen_count = 0; # number of files open for writing (or appending)
my $wopen_max = 1000; # Never exceed this number.
sub get_replacement {
my ($repstring) = @_;
my $c;
# Repstring is either undef (for {}) or a positive integer (for {n}, for some n).
$c = (defined $repstring) ? $repstring - 1 : $column;
exists $current_fields->[$c]
or die "not enough columns on line $. of $current_infile";
return $current_fields->[$c];
}
sub compute_outfn {
my $ret = $current_template;
$ret =~ s/\{([1-9][0-9]*)?\}/get_replacement($1)/ge;
return $ret;
}
sub do_input_file {
my $infile = shift;
my $h = $infile eq '-' ? \*STDIN : open_read($infile);
$current_infile = ($infile eq '-') ? 'STDIN' : $infile;
my ($line, $outfn, $outhandle);
if ($header) {
$headerline = <$h>;
}
compute_current_template();
while (<$h>) {
# Save a copy of the text and then chomp $_. We don't want the
# terminating newline to show up in $x if we are splitting
# based on the last column.
$line = $_;
chomp;
$current_fields = [split /\Q$delim\E/o];
$outfn = compute_outfn();
$outhandle = get_output_handle($outfn);
print $outhandle $line;
}
close($h);
}
# We work around EMFILE/ENFILE as follows:
#
# Files open for output are tracked in %outfiles. For a given filename
# $fn, $outfiles{$fn} is a ref to a hash with three members: next, prev, handle.
#
# ->{handle} is, if defined, an file handle opened for writing to
# $fn. The two other fields are used for maintaining a LRU list of
# open file handles; the use of this is explained below.
#
# When we read a line of input and figure out what file $fn that line
# should go to, we first try to look up $fn in %outfiles. If it exists
# and ->{handle} is defined, we move $outfiles{$fn} to the front of
# the LRU list and use the handle directly.
#
# If $outfiles{$fn} exists but ->{handle} is not defined, it means
# that the file has at some point been opened, but we needed to close
# it to free up a file descriptor. In that case, we will open the file
# again, but in append mode so as not to overwrite the already written
# contents.
#
# If $outfiles{$fn} does not exist, we have never encountered $fn
# before. So open $fn for writing and insert an appropriate anon href
# in %outfiles.
#
# In either case, the just (re)opened file is inserted in the front of
# the list of open files.
#
# Now, if (re)opening a file for writing (appending) fails, and errno
# (aka $! in perl) is ENFILE or EMFILE, we may be able to solve the
# problem by closing an open file and trying again. We always close
# the least recently used file, since real input has a tendency to be
# somewhat sorted. We keep trying to close file handles as long as we
# have any file open for output (in the case of ENFILE, some other
# process may have successfully opened a file after we closed one but
# before we got around to trying again). If we run out of files to
# close, or if we encounter an error other than E[MN]FILE, we die with
# an error message.
#
sub list_insert_head {
my $href = shift;
die unless (!defined $href->{prev} && !defined $href->{next});
$href->{next} = $listhead->{next};
$href->{next}->{prev} = $href;
$href->{prev} = $listhead;
$listhead->{next} = $href;
}
sub list_remove {
my $href = shift;
die if $href == $listhead;
$href->{prev}->{next} = $href->{next};
$href->{next}->{prev} = $href->{prev};
$href->{prev} = $href->{next} = undef;
}
sub list_move_to_front {
my $href = shift;
return if $listhead->{next} == $href;
list_remove($href);
list_insert_head($href);
}
sub list_remove_last {
my $href = $listhead->{prev};
return undef if $href == $listhead;
list_remove($href);
return $href;
}
sub get_output_handle {
my $fn = shift;
my $href = exists $outfiles{$fn} ? $outfiles{$fn} : undef;
if (defined $href && defined $href->{handle}) {
list_move_to_front($href);
return $href->{handle}
}
if (defined $href) {
$href->{handle} = open_append($fn);
}
else {
$href = {prev => undef, next => undef, handle => open_write($fn)};
$outfiles{$fn} = $href;
}
list_insert_head($href);
$wopen_count++;
return $href->{handle};
}
sub open_read {
my $fn = shift;
return open_common($fn, '<', 'reading');
}
sub open_write {
my $fn = shift;
my $h = open_common($fn, '>', 'writing');
print $h $headerline if defined $headerline;
return $h;
}
sub open_append {
my $fn = shift;
return open_common($fn, '>>', 'appending');
}
sub open_common {
my ($fn, $mode, $verb) = @_;
my ($h, $r);
close_lru_file() while ($wopen_count >= $wopen_max);
while (!($r = open($h, $mode, $fn)) && ($! == EMFILE || $! == ENFILE)) {
last if $wopen_count == 0;
close_lru_files($! == EMFILE ? $wopen_count - 1 : int($wopen_count/2));
}
if (!$r) {
die "unable to open ${fn} for ${verb}: $!";
}
return $h;
}
sub close_lru_file {
if ($wopen_count <= 0) {
die "BUG! ", (caller(0))[3], "() should not be called when wopen_count == $wopen_count";
}
my $href = list_remove_last();
if (!defined $href) {
die "BUG! ", (caller(0))[3], "() called while no files appear to be open for writing";
}
close($href->{handle}) or die "closing file handle failed: $!";
$href->{handle} = undef;
$wopen_count--;
}
sub close_lru_files {
my $target = shift;
close_lru_file() while ($wopen_count > $target);
}
push @ARGV, '-' if (@ARGV == 0);
for my $inp (@ARGV) {
do_input_file($inp);
}
# Close all output files. We reuse close_lru_files() so that errors are reported.
close_lru_files(0);