-
Notifications
You must be signed in to change notification settings - Fork 1
/
random_subset
executable file
·71 lines (60 loc) · 1.39 KB
/
random_subset
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/perl
use Getopt::Long;
use strict;
use warnings;

# Command-line state, filled in by GetOptions below.
my $frac;           # -f / --fraction: per-line keep probability (approx mode)
my $count;          # -n / --count: number of lines to emit
my $line_number;    # -l / --line-number: prefix each line with its number
my $seed;           # -s / --seed: seed for the random number generator

GetOptions(
    "n|count=i"      => \$count,
    "s|seed=i"       => \$seed,
    "f|fraction=f"   => \$frac,
    "l|line-number!" => \$line_number,
) or print_usage();

# "=i" accepts a leading minus sign, so reject negative counts explicitly:
# they make no sense as a sample size and would otherwise be silently
# interpreted differently by the two sampling modes.
if (defined $count && $count < 0) {
    warn "$0: COUNT must be non-negative\n";
    print_usage();
}

# Only seed explicitly when asked, so unseeded runs stay non-deterministic.
srand($seed) if defined $seed;

if (defined $frac) {
    # Fraction given: keep each line independently with probability $frac.
    approx();
}
else {
    # No fraction: pick exactly $count lines (default 1) uniformly at random.
    $count = 1 unless defined $count;
    exact();
}
# Select $count input lines uniformly at random in a single pass and print
# them in their original input order.
#
# This is reservoir sampling: the first $count lines fill the reservoir, and
# the line with (1-based) number i > $count then evicts a uniformly chosen
# resident with probability $count / i. Assuming rand(x) is uniform on
# [0, x), every $count-element subset of the input is equally likely. The
# total number of input lines need not be known up front, and only $count
# lines are held in memory at any time.
sub exact {
    my @reservoir;

    while (my $text = <>) {
        my $entry = [$., $text];

        if (@reservoir < $count) {
            # Still filling the reservoir: keep unconditionally.
            push @reservoir, $entry;
        }
        elsif (rand($.) < $count) {
            # Accepted with probability $count / $.; overwrite a random
            # slot (the fractional index is truncated to an integer).
            $reservoir[rand($count)] = $entry;
        }
    }

    # Restore input order by the recorded line number before printing.
    my @ordered = sort { $a->[0] <=> $b->[0] } @reservoir;
    print_lines(@ordered);
}
# Stream the input and print each line independently with probability $frac.
#
# If $count is defined it acts as an upper bound on the number of lines
# printed: reading stops as soon as that many lines have been emitted, and
# nothing is read at all when the bound is zero or negative. $count is
# decremented in place as lines are printed.
sub approx {
    # An exhausted (or nonsensical negative) budget means there is nothing
    # to do; bail out before consuming any input.
    return if defined $count && $count <= 0;

    # The explicit defined() matters: a bare truth test would treat a final
    # line consisting of "0" without a trailing newline as end of input.
    while (defined(my $line = <>)) {
        next unless rand() < $frac;

        print_lines([$., $line]);

        # Stop before reading another line once the budget is used up.
        last if defined $count && --$count <= 0;
    }
}
# Print every entry passed in @_. Each entry is an array reference of the
# form [line number, line text]. When $line_number is true, the text is
# preceded by its line number and a tab; otherwise only the text is printed.
# The text is emitted as-is, with no newline appended.
sub print_lines {
    for my $entry (@_) {
        my ($number, $text) = @$entry;

        if ($line_number) {
            printf "%s\t%s", $number, $text;
        }
        else {
            print $text;
        }
    }
}
# Print a one-line synopsis of the supported options and exit with status 1.
# The message goes to STDERR because this is an error path: it must not be
# mixed into the sampled lines written to STDOUT.
sub print_usage {
    print STDERR "$0 [-n COUNT] [-f FRACTION] [-s SEED] [-l]\n";
    exit(1);
}