Option for non-HDF5 #12

Open
wants to merge 5 commits into master
95 changes: 95 additions & 0 deletions scripts/preprocess.lua
@@ -0,0 +1,95 @@
require 'pl'
require 'torch'

cmd = torch.CmdLine()
cmd:text()
cmd:text('Preprocess a text file for training a language model.')
cmd:option('--input_text', 'data/tiny-shakespeare.txt', 'Input text file')
-- split output into multiple files
cmd:option('--train_t7', 'data/train-tiny-shakespeare.t7',
           'Output training data file (Torch binary)')
cmd:option('--valid_t7', 'data/valid-tiny-shakespeare.t7',
           'Output validation data file (Torch binary)')
cmd:option('--test_t7', 'data/test-tiny-shakespeare.t7',
           'Output test data file (Torch binary)')
cmd:option('--output_vocab', 'data/vocab-tiny-shakespeare.t7',
           'Output vocab file (Torch binary)')
cmd:option('--val_frac', 0.1, 'Validation fraction')
cmd:option('--test_frac', 0.1, 'Testing fraction')
cmd:option('--quiet', false, 'Disable all verbose outputs')
cmd:text()
opt = cmd:parse(arg or {})


-- First pass: count characters and build the vocabulary
char2index = {}
char_count = 0
vocab_count = 0
f = assert(io.open(opt.input_text))
while true do
  local line = f:read()
  if not line then break end
  -- note: gmatch('.') iterates bytes, so a multi-byte UTF-8 character
  -- counts as several tokens
  for c in line:gmatch('.') do
    if not char2index[c] then
      vocab_count = vocab_count + 1
      char2index[c] = vocab_count
    end
    char_count = char_count + 1
  end
  -- f:read() strips the trailing newline, so count it separately
  char_count = char_count + 1
end
f:close()
-- f:read() never returns '\n', so add the newline character to the
-- vocabulary manually
vocab_count = vocab_count + 1
char2index['\n'] = vocab_count
-- invert char2index to get the index-to-character map
index2char = {}
for k, v in pairs(char2index) do index2char[v] = k end

-- compute split size
val_size = math.floor(opt.val_frac * char_count)
test_size = math.floor(opt.test_frac * char_count)
train_size = char_count - val_size - test_size
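-- Example (illustrative numbers): with char_count = 1000 and the default
-- fractions, val_size = test_size = math.floor(0.1 * 1000) = 100, leaving
-- train_size = 1000 - 100 - 100 = 800.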

-- verbose
if not opt.quiet then
  print('Total vocabulary size: ' .. #index2char)
  print('Total tokens in file: ' .. char_count)
  print('  Training size: ' .. train_size)
  print('  Val size: ' .. val_size)
  print('  Test size: ' .. test_size)
end

train = torch.IntTensor(train_size)
valid = torch.IntTensor(val_size)
test = torch.IntTensor(test_size)
dataset = {train, valid, test}

-- Second pass: write character indices into the split tensors in order
split_idx, cur_idx = 1, 1
f = assert(io.open(opt.input_text))
while true do
  local line = f:read()
  if not line then break end
  -- put back the newline that f:read() stripped
  line = line .. '\n'
  for c in line:gmatch('.') do
    -- skip splits of size 0 (val_frac or test_frac may be 0)
    while dataset[split_idx]:size():size() == 0 do
      split_idx = split_idx + 1
    end
    dataset[split_idx][cur_idx] = char2index[c]
    cur_idx = cur_idx + 1
    -- move on to the next split once the current one is full
    if cur_idx > dataset[split_idx]:size(1) then
      split_idx = split_idx + 1
      cur_idx = 1
    end
  end
end
f:close()
-- save to file
torch.save(opt.train_t7, train)
if val_size > 0 then torch.save(opt.valid_t7, valid) end
if test_size > 0 then torch.save(opt.test_t7, test) end
torch.save(opt.output_vocab, index2char)
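A quick way to sanity-check the script is to load the artifacts it writes back into Torch. A minimal sketch, assuming the default paths above and that th scripts/preprocess.lua --input_text data/tiny-shakespeare.txt has been run first:

require 'torch'
local train = torch.load('data/train-tiny-shakespeare.t7')       -- 1-D IntTensor of character indices
local index2char = torch.load('data/vocab-tiny-shakespeare.t7')  -- index -> character map
print(train:nElement() .. ' training tokens, vocab size ' .. #index2char)
print('first character: ' .. index2char[train[1]])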
24 changes: 19 additions & 5 deletions train.lua
@@ -4,6 +4,7 @@ require 'optim'
 
 require 'LanguageModel'
 require 'util.DataLoader'
+require 'util.MiniBatchLoader'
 
 local utils = require 'util.utils'
 
@@ -15,6 +16,12 @@ cmd:option('-input_h5', 'data/tiny-shakespeare.h5')
 cmd:option('-input_json', 'data/tiny-shakespeare.json')
 cmd:option('-batch_size', 50)
 cmd:option('-seq_length', 50)
+-- Optional: load dataset in t7 format
+cmd:option('-format', 'h5')
+cmd:option('-train_t7', 'data/train-tiny-shakespeare.t7')
+cmd:option('-valid_t7', 'data/valid-tiny-shakespeare.t7')
+cmd:option('-test_t7', 'data/test-tiny-shakespeare.t7')
+cmd:option('-vocab_t7', 'data/vocab-tiny-shakespeare.t7')
 
 -- Model options
 cmd:option('-model_type', 'lstm')
@@ -44,7 +51,7 @@ cmd:option('-gpu', 0)
 cmd:option('-gpu_backend', 'cuda')
 
 local opt = cmd:parse(arg)
-
+print(opt)
 
 -- Set up GPU stuff
 local dtype = 'torch.FloatTensor'
@@ -70,13 +77,20 @@ end
 
 
 -- Initialize the DataLoader and vocabulary
-local loader = DataLoader(opt)
-local vocab = utils.read_json(opt.input_json)
+local loader, vocab
 local idx_to_token = {}
-for k, v in pairs(vocab.idx_to_token) do
-  idx_to_token[tonumber(k)] = v
+if opt.format == 't7' then
+  loader = MiniBatchLoader(opt)
+  idx_to_token = torch.load(opt.vocab_t7)
+else
+  loader = DataLoader(opt)
+  vocab = utils.read_json(opt.input_json)
+  for k, v in pairs(vocab.idx_to_token) do
+    idx_to_token[tonumber(k)] = v
+  end
 end
 
 
 -- Initialize the model and criterion
 local opt_clone = torch.deserialize(torch.serialize(opt))
 opt_clone.idx_to_token = idx_to_token
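With these changes, training can bypass HDF5 entirely. A hedged example invocation, assuming the default paths written by scripts/preprocess.lua (the flag names are exactly the options added above):

th train.lua -format t7 \
  -train_t7 data/train-tiny-shakespeare.t7 \
  -valid_t7 data/valid-tiny-shakespeare.t7 \
  -test_t7 data/test-tiny-shakespeare.t7 \
  -vocab_t7 data/vocab-tiny-shakespeare.t7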
71 changes: 71 additions & 0 deletions util/MiniBatchLoader.lua
@@ -0,0 +1,71 @@
require 'torch'
require 'xlua'  -- provides xlua.unpack, used in the constructor below

local MiniBatchLoader = torch.class('MiniBatchLoader')

function MiniBatchLoader:__init(config)
  config = config or {}
  local args
  args, self.train_file, self.valid_file, self.test_file,
    self.batch_size, self.seq_length = xlua.unpack(
      {config},
      'MiniBatchLoader',
      'Load data files in Torch binary format. Data is clipped to fit mini-batches.',
      {arg='train_t7', type='string', default='data/train-tiny-shakespeare.t7',
       help='training data in Torch binary format (see scripts/preprocess.lua)'},
      {arg='valid_t7', type='string', default='data/valid-tiny-shakespeare.t7',
       help='validation data in Torch binary format (see scripts/preprocess.lua)'},
      {arg='test_t7', type='string', default='data/test-tiny-shakespeare.t7',
       help='test data in Torch binary format (see scripts/preprocess.lua)'},
      {arg='batch_size', type='number', default=8,
       help='number of sequences in each mini-batch'},
      {arg='seq_length', type='number', default=6,
       help='number of characters in each sequence'}
  )
  self.x_splits = {}
  self.y_splits = {}
  self.split_sizes = {}
  local b, l = self.batch_size, self.seq_length
  self.x_splits['train'], self.y_splits['train'] = self:loadData(self.train_file, b, l)
  self.x_splits['val'], self.y_splits['val'] = self:loadData(self.valid_file, b, l)
  self.x_splits['test'], self.y_splits['test'] = self:loadData(self.test_file, b, l)
  self.split_sizes['train'] = self.x_splits['train']:size(1)
  self.split_sizes['val'] = self.x_splits['val']:size(1)
  self.split_sizes['test'] = self.x_splits['test']:size(1)
  self.split_idxs = {train=1, val=1, test=1}
  collectgarbage()
end

function MiniBatchLoader:loadData(file_path, b, l)
  local tensor = torch.load(file_path)
  local num = tensor:nElement()
  -- Chop the tail so the usable length is a multiple of b * l; use num - 1
  -- here because y is x shifted by one, so it must never index past the end
  -- of the tensor. Each batch row holds a continuous stream of data.
  local m = math.floor((num - 1) / (b * l)) * (b * l)
  local vx = tensor[{{1, m}}]:view(b, -1, l)
  local vy = tensor[{{2, m + 1}}]:view(b, -1, l)
  -- rearrange so the leading dimension indexes chunks: (B, N, L) -> (N, B, L)
  -- XXX: this element-wise copy is not very efficient
  local vxx = torch.IntTensor(vx:size(2), vx:size(1), vx:size(3))
  local vyy = torch.IntTensor(vy:size(2), vy:size(1), vy:size(3))
  for i = 1, vyy:size(1) do
    vyy[i] = vy[{{}, i, {}}]
    vxx[i] = vx[{{}, i, {}}]
  end
  -- note: :contiguous() returns a tensor rather than modifying in place;
  -- vxx and vyy are freshly allocated here, so they are already contiguous
  return vxx, vyy
end
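
-- Worked example (illustrative numbers): with b = 2, l = 3 and a stream of
-- 14 characters, m = floor(13 / 6) * 6 = 12, so x views characters 1..12 as
-- a 2 x 2 x 3 tensor and y views characters 2..13 the same way; character 14
-- is unused.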

function MiniBatchLoader:nextBatch(split)
  local idx = self.split_idxs[split]
  assert(idx, 'invalid split ' .. split)
  local x = self.x_splits[split][idx]
  local y = self.y_splits[split][idx]
  if idx == self.split_sizes[split] then
    self.split_idxs[split] = 1
  else
    self.split_idxs[split] = idx + 1
  end
  return x, y
end
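
For reference, a minimal standalone sketch of driving the loader (the paths assume the preprocess defaults; batch_size and seq_length here are just the loader's own defaults made explicit):

require 'torch'
require 'xlua'
require 'util.MiniBatchLoader'

local loader = MiniBatchLoader{
  train_t7 = 'data/train-tiny-shakespeare.t7',
  valid_t7 = 'data/valid-tiny-shakespeare.t7',
  test_t7  = 'data/test-tiny-shakespeare.t7',
  batch_size = 8,
  seq_length = 6,
}
local x, y = loader:nextBatch('train')
-- x and y are 8 x 6 IntTensors; y holds the next character for each position of x
print(x:size(1), x:size(2))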