Option for non-HDF5 #12

Open
wants to merge 5 commits into master
95 changes: 95 additions & 0 deletions scripts/preprocess.lua
@@ -0,0 +1,95 @@
require 'pl'
require 'torch'

cmd = torch.CmdLine()
cmd:text()
cmd:text('Preprocess a text file for training a language model.')
cmd:option('--input_text', 'data/tiny-shakespeare.txt', 'Input text file')
-- split output into multiple files
cmd:option('--train_t7', 'data/train-tiny-shakespeare.t7',
           'Output training data file (Torch binary)')
cmd:option('--valid_t7', 'data/valid-tiny-shakespeare.t7',
           'Output validation data file (Torch binary)')
cmd:option('--test_t7', 'data/test-tiny-shakespeare.t7',
           'Output test data file (Torch binary)')
cmd:option('--output_vocab', 'data/vocab-tiny-shakespeare.t7',
           'Output vocab file (Torch binary)')
cmd:option('--val_frac', 0.1, 'Validation fraction')
cmd:option('--test_frac', 0.1, 'Testing fraction')
cmd:option('--quiet', false, 'Disable all verbose outputs')
cmd:text()
opt = cmd:parse(arg or {})


-- First pass: count characters and build the vocabulary
char2index = {}
char_count = 0
vocab_count = 0
f = assert(io.open(opt.input_text))
while true do
  local line = f:read()
  if not line then break end
  -- note: gmatch('.') iterates bytes, so a multi-byte UTF-8 character
  -- counts as several tokens
  for c in line:gmatch('.') do
    if not char2index[c] then
      vocab_count = vocab_count + 1
      char2index[c] = vocab_count
    end
    char_count = char_count + 1
  end
  -- f:read() strips the trailing newline, so count it separately
  char_count = char_count + 1
end
f:close()
-- f:read() never returns '\n', so add the newline character to the
-- vocabulary manually
vocab_count = vocab_count + 1
char2index['\n'] = vocab_count
-- invert char2index to get the index-to-character map
index2char = {}
for k, v in pairs(char2index) do index2char[v] = k end

-- compute split size
val_size = math.floor(opt.val_frac * char_count)
test_size = math.floor(opt.test_frac * char_count)
train_size = char_count - val_size - test_size
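-- Example (illustrative numbers): with char_count = 1000 and the default
-- fractions, val_size = test_size = math.floor(0.1 * 1000) = 100, leaving
-- train_size = 1000 - 100 - 100 = 800.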

-- verbose
if not opt.quiet then
  print('Total vocabulary size: ' .. #index2char)
  print('Total tokens in file: ' .. char_count)
  print('  Training size: ' .. train_size)
  print('  Val size: ' .. val_size)
  print('  Test size: ' .. test_size)
end

train = torch.IntTensor(train_size)
valid = torch.IntTensor(val_size)
test = torch.IntTensor(test_size)
dataset = {train, valid, test}

-- Second pass: write character indices into the split tensors in order
split_idx, cur_idx = 1, 1
f = assert(io.open(opt.input_text))
while true do
  local line = f:read()
  if not line then break end
  -- put back the newline that f:read() stripped
  line = line .. '\n'
  for c in line:gmatch('.') do
    -- skip splits of size 0 (val_frac or test_frac may be 0)
    while dataset[split_idx]:size():size() == 0 do
      split_idx = split_idx + 1
    end
    dataset[split_idx][cur_idx] = char2index[c]
    cur_idx = cur_idx + 1
    -- move on to the next split once the current one is full
    if cur_idx > dataset[split_idx]:size(1) then
      split_idx = split_idx + 1
      cur_idx = 1
    end
  end
end
f:close()
-- save to file
torch.save(opt.train_t7, train)
if val_size > 0 then torch.save(opt.valid_t7, valid) end
if test_size > 0 then torch.save(opt.test_t7, test) end
torch.save(opt.output_vocab, index2char)
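A quick way to sanity-check the script is to load the artifacts it writes back into Torch. A minimal sketch, assuming the default paths above and that th scripts/preprocess.lua --input_text data/tiny-shakespeare.txt has been run first:

require 'torch'
local train = torch.load('data/train-tiny-shakespeare.t7')       -- 1-D IntTensor of character indices
local index2char = torch.load('data/vocab-tiny-shakespeare.t7')  -- index -> character map
print(train:nElement() .. ' training tokens, vocab size ' .. #index2char)
print('first character: ' .. index2char[train[1]])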
24 changes: 19 additions & 5 deletions train.lua
@@ -4,6 +4,7 @@ require 'optim'
 
 require 'LanguageModel'
 require 'util.DataLoader'
+require 'util.MiniBatchLoader'
 
 local utils = require 'util.utils'
 
@@ -15,6 +16,12 @@ cmd:option('-input_h5', 'data/tiny-shakespeare.h5')
 cmd:option('-input_json', 'data/tiny-shakespeare.json')
 cmd:option('-batch_size', 50)
 cmd:option('-seq_length', 50)
+-- Optional: load dataset in t7 format
+cmd:option('-format', 'h5')
+cmd:option('-train_t7', 'data/train-tiny-shakespeare.t7')
+cmd:option('-valid_t7', 'data/valid-tiny-shakespeare.t7')
+cmd:option('-test_t7', 'data/test-tiny-shakespeare.t7')
+cmd:option('-vocab_t7', 'data/vocab-tiny-shakespeare.t7')
 
 -- Model options
 cmd:option('-model_type', 'lstm')
@@ -44,7 +51,7 @@ cmd:option('-gpu', 0)
 cmd:option('-gpu_backend', 'cuda')
 
 local opt = cmd:parse(arg)
-
+print(opt)
 
 -- Set up GPU stuff
 local dtype = 'torch.FloatTensor'
@@ -70,13 +77,20 @@ end
 
 
 -- Initialize the DataLoader and vocabulary
-local loader = DataLoader(opt)
-local vocab = utils.read_json(opt.input_json)
+local loader, vocab
 local idx_to_token = {}
-for k, v in pairs(vocab.idx_to_token) do
-  idx_to_token[tonumber(k)] = v
+if opt.format == 't7' then
+  loader = MiniBatchLoader(opt)
+  idx_to_token = torch.load(opt.vocab_t7)
+else
+  loader = DataLoader(opt)
+  vocab = utils.read_json(opt.input_json)
+  for k, v in pairs(vocab.idx_to_token) do
+    idx_to_token[tonumber(k)] = v
+  end
 end
 
 
 -- Initialize the model and criterion
 local opt_clone = torch.deserialize(torch.serialize(opt))
 opt_clone.idx_to_token = idx_to_token
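With these changes, training can bypass HDF5 entirely. A hedged example invocation, assuming the default paths written by scripts/preprocess.lua (the flag names are exactly the options added above):

th train.lua -format t7 \
  -train_t7 data/train-tiny-shakespeare.t7 \
  -valid_t7 data/valid-tiny-shakespeare.t7 \
  -test_t7 data/test-tiny-shakespeare.t7 \
  -vocab_t7 data/vocab-tiny-shakespeare.t7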
71 changes: 71 additions & 0 deletions util/MiniBatchLoader.lua
@@ -0,0 +1,71 @@
require 'torch'
require 'xlua'  -- provides xlua.unpack, used in the constructor below

local MiniBatchLoader = torch.class('MiniBatchLoader')

function MiniBatchLoader:__init(config)
  config = config or {}
  local args
  args, self.train_file, self.valid_file, self.test_file,
    self.batch_size, self.seq_length = xlua.unpack(
      {config},
      'MiniBatchLoader',
      'Load data files in Torch binary format. Data is clipped to fit mini-batches.',
      {arg='train_t7', type='string', default='data/train-tiny-shakespeare.t7',
       help='training data in Torch binary format (see scripts/preprocess.lua)'},
      {arg='valid_t7', type='string', default='data/valid-tiny-shakespeare.t7',
       help='validation data in Torch binary format (see scripts/preprocess.lua)'},
      {arg='test_t7', type='string', default='data/test-tiny-shakespeare.t7',
       help='test data in Torch binary format (see scripts/preprocess.lua)'},
      {arg='batch_size', type='number', default=8,
       help='number of sequences in each mini-batch'},
      {arg='seq_length', type='number', default=6,
       help='number of characters in each sequence'}
  )
  self.x_splits = {}
  self.y_splits = {}
  self.split_sizes = {}
  local b, l = self.batch_size, self.seq_length
  self.x_splits['train'], self.y_splits['train'] = self:loadData(self.train_file, b, l)
  self.x_splits['val'], self.y_splits['val'] = self:loadData(self.valid_file, b, l)
  self.x_splits['test'], self.y_splits['test'] = self:loadData(self.test_file, b, l)
  self.split_sizes['train'] = self.x_splits['train']:size(1)
  self.split_sizes['val'] = self.x_splits['val']:size(1)
  self.split_sizes['test'] = self.x_splits['test']:size(1)
  self.split_idxs = {train=1, val=1, test=1}
  collectgarbage()
end

function MiniBatchLoader:loadData(file_path, b, l)
  local tensor = torch.load(file_path)
  local num = tensor:nElement()
  -- Chop the tail so the usable length is a multiple of b * l; use num - 1
  -- here because y is x shifted by one, so it must never index past the end
  -- of the tensor. Each batch row holds a continuous stream of data.
  local m = math.floor((num - 1) / (b * l)) * (b * l)
  local vx = tensor[{{1, m}}]:view(b, -1, l)
  local vy = tensor[{{2, m + 1}}]:view(b, -1, l)
  -- rearrange so the leading dimension indexes chunks: (B, N, L) -> (N, B, L)
  -- XXX: this element-wise copy is not very efficient
  local vxx = torch.IntTensor(vx:size(2), vx:size(1), vx:size(3))
  local vyy = torch.IntTensor(vy:size(2), vy:size(1), vy:size(3))
  for i = 1, vyy:size(1) do
    vyy[i] = vy[{{}, i, {}}]
    vxx[i] = vx[{{}, i, {}}]
  end
  -- note: :contiguous() returns a tensor rather than modifying in place;
  -- vxx and vyy are freshly allocated here, so they are already contiguous
  return vxx, vyy
end
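
-- Worked example (illustrative numbers): with b = 2, l = 3 and a stream of
-- 14 characters, m = floor(13 / 6) * 6 = 12, so x views characters 1..12 as
-- a 2 x 2 x 3 tensor and y views characters 2..13 the same way; character 14
-- is unused.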

function MiniBatchLoader:nextBatch(split)
  local idx = self.split_idxs[split]
  assert(idx, 'invalid split ' .. split)
  local x = self.x_splits[split][idx]
  local y = self.y_splits[split][idx]
  if idx == self.split_sizes[split] then
    self.split_idxs[split] = 1
  else
    self.split_idxs[split] = idx + 1
  end
  return x, y
end
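
For reference, a minimal standalone sketch of driving the loader (the paths assume the preprocess defaults; batch_size and seq_length here are just the loader's own defaults made explicit):

require 'torch'
require 'xlua'
require 'util.MiniBatchLoader'

local loader = MiniBatchLoader{
  train_t7 = 'data/train-tiny-shakespeare.t7',
  valid_t7 = 'data/valid-tiny-shakespeare.t7',
  test_t7  = 'data/test-tiny-shakespeare.t7',
  batch_size = 8,
  seq_length = 6,
}
local x, y = loader:nextBatch('train')
-- x and y are 8 x 6 IntTensors; y holds the next character for each position of x
print(x:size(1), x:size(2))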