forked from nshepperd/gpt-2
-
Notifications
You must be signed in to change notification settings - Fork 1
/
encode.py
executable file
·31 lines (25 loc) · 1.3 KB
/
encode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#!/usr/bin/env python3
# Usage:
# PYTHONPATH=src ./encode.py <file|directory|glob> /path/to/output.npz
# PYTHONPATH=src ./train --dataset /path/to/output.npz
import argparse
import numpy as np
import encoder
from load_dataset import load_dataset
def _build_parser():
    """Create the command-line parser for this script.

    The parser takes two positional arguments (the input text location and
    the output ``.npz`` path) and three optional flags: ``--model_name``,
    ``--combine`` and ``--encoding``. Defaults are shown in ``--help``
    because ``ArgumentDefaultsHelpFormatter`` is used.
    """
    cli = argparse.ArgumentParser(
        description='Pre-encode text files into tokenized training set.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Optional flags.
    cli.add_argument(
        '--model_name',
        metavar='MODEL',
        type=str,
        default='117M',
        help='Pretrained model name',
    )
    cli.add_argument(
        '--combine',
        metavar='CHARS',
        type=int,
        default=50000,
        help='Concatenate files with <|endoftext|> separator into chunks of this minimum size',
    )
    cli.add_argument(
        '--encoding',
        type=str,
        default='utf-8',
        help='Set the encoding for reading and writing files.',
    )

    # Positional arguments: where to read text from, where to write tokens.
    cli.add_argument(
        'in_text',
        metavar='PATH',
        type=str,
        help='Input file, directory, or glob pattern (utf-8 text).',
    )
    cli.add_argument(
        'out_npz',
        metavar='OUT.npz',
        type=str,
        help='Output file path',
    )
    return cli


# Module-level parser used by main().
parser = _build_parser()
def main(argv=None):
    """Encode the input text and write the resulting chunks to an .npz file.

    Parses the command line, loads the encoder for the requested model,
    passes the input path to ``load_dataset`` and saves whatever chunks it
    returns with ``np.savez_compressed``.

    Args:
        argv: Optional list of command-line argument strings. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, so a
            plain ``main()`` call behaves exactly as before. Passing a list
            lets the script be driven programmatically.
    """
    args = parser.parse_args(argv)
    enc = encoder.get_encoder(args.model_name)

    print('Reading files')
    # NOTE(review): presumably a sequence of token arrays, one per combined
    # chunk -- confirm against load_dataset.
    chunks = load_dataset(enc, args.in_text, args.combine, encoding=args.encoding)

    print('Writing', args.out_npz)
    # Chunks are passed positionally, so NumPy stores them under its
    # automatic names (arr_0, arr_1, ...).
    np.savez_compressed(args.out_npz, *chunks)


if __name__ == '__main__':
    main()