This repository has been archived by the owner on Jan 13, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 183
/
data-stats.sh
60 lines (47 loc) · 1.62 KB
/
data-stats.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/env bash
# This script takes a training and test set of a task, and outputs statistics
# of the following form. Note that this script counts each *story* as an
# example, not each *question*.
#
#     Task 4
#     Overlap: 112 overlapping unique examples (12.0% of test set)
#
#     # Examples # Unique examples
#     Train: 1000 934 (6.6% duplicates)
#     Test: 1000 933 (6.7% duplicates)
#
# Usage: data-stats.sh train test
#   train  path to the training-set file
#   test   path to the test-set file
# Exits with status 1 (usage printed on stderr) unless exactly two arguments
# are given.

# Check that the user passed 2 files
if (( $# != 2 )); then
  # Diagnostics go to stderr so they never mix with the statistics on stdout.
  printf '%s\n' "Usage: $0 train test" >&2
  exit 1
fi

# Keep both paths in an array so later code can index them: 0 = train, 1 = test.
args=("$@")
# Outputs a single example per line
#
# Arguments: $1 - path to a data file; only its first two tab-separated fields
#                 are kept.
# Outputs:   one line per story on stdout. A new story starts at every input
#            line that begins with "1 "; all lines of a story are concatenated
#            with nothing in between. Empty input lines contribute nothing.
# Returns:   non-zero if $1 cannot be opened for reading.
#
# The joining is done in a single awk pass instead of using a backslash as an
# in-band story separator, so backslashes in the data are preserved, no
# GNU-only sed escapes are needed, and the last line is newline-terminated.
format-examples() {
  {
    cut -f 1,2 | awk '
      # A line starting with "1 " opens a new story: close the previous one.
      /^1 / && started { printf "\n" }
      $0 != ""         { started = 1 }
                       { printf "%s", $0 }
      END              { if (started) printf "\n" }
    '
  } < "$1"
}
# Collect file statistics
#
# For each input (index 0 = train, 1 = test) this fills four parallel arrays:
#   formatted[i]            stories of args[i], one per line
#   unique[i]               the same lines, sorted with duplicates removed
#   num_examples[i]         line count of formatted[i]
#   num_unique_examples[i]  line count of unique[i]
# NOTE: a here-string always appends a newline, so an empty value is still
# counted as one line by wc -l.
for i in 0 1; do
  formatted[i]=$(format-examples "${args[i]}")
  unique[i]=$(sort <<< "${formatted[i]}" | uniq)
  num_examples[i]=$(wc -l <<< "${formatted[i]}")
  num_unique_examples[i]=$(wc -l <<< "${unique[i]}")
done

# Find overlap
# comm -12 prints only the lines present in both (already sorted) inputs.
num_overlap=$(
  comm -12 <(echo "${unique[0]}") <(echo "${unique[1]}") | wc -l
)
# Print statistics

# percent-of PART WHOLE
# Prints PART as a percentage of WHOLE with one decimal place (e.g. "12.0").
# Prints "0.0" when WHOLE is zero instead of dividing by zero.
# The number is computed and formatted by awk under LC_ALL=C so the decimal
# separator is always '.', whatever the locale of the calling shell.
percent-of() {
  LC_ALL=C awk -v part="$1" -v whole="$2" \
    'BEGIN { printf "%.1f", (whole > 0 ? 100 * part / whole : 0) }'
}

# print-stats
# Writes the overlap line, the column header and one row each for the train
# and test sets to stdout.
# Globals (read): num_overlap, num_examples[], num_unique_examples[]
#                 (index 0 = train, 1 = test).
# Every printf uses a fixed format string and receives the values as separate
# quoted arguments, so nothing in the data is ever interpreted as a format.
print-stats() {
  local -a names=(Train Test)
  local i total distinct overlap

  # $(( )) normalizes the counts: some wc implementations pad their output
  # with leading blanks.
  overlap=$((num_overlap))
  printf 'Overlap: %u overlapping unique examples (%s%% of test set)\n' \
    "$overlap" "$(percent-of "$overlap" "$((num_unique_examples[1]))")"
  printf '%s\n' '# Examples # Unique examples'

  for i in 0 1; do
    total=$((num_examples[i]))
    distinct=$((num_unique_examples[i]))
    # Duplicates are the examples that disappear when de-duplicating.
    printf '%5s: %-12u %u (%s%% duplicates)\n' \
      "${names[i]}" "$total" "$distinct" \
      "$(percent-of "$((total - distinct))" "$total")"
  done
}

print-stats