From 4a43f0d748b860888e40c76bf10a8c320d15725d Mon Sep 17 00:00:00 2001
From: sneakerkg <xiaotj1990327@gmail.com>
Date: Thu, 3 Sep 2015 08:42:14 +0800
Subject: [PATCH 01/15] merging the code, not compiled

---
 dmlc-core            |  2 +-
 include/mxnet/io.h   | 16 ++++++++
 src/common/utils.h   |  5 +++
 src/io/inst_vector.h | 92 ++++++++++++++++++++------------------------
 src/io/io.cc         |  9 ++++-
 src/io/iter_mnist.cc |  4 +-
 6 files changed, 73 insertions(+), 55 deletions(-)
diff --git a/dmlc-core b/dmlc-core
index db6ec995f148..7d3c78428819 160000
--- a/dmlc-core
+++ b/dmlc-core
@@ -1 +1 @@
-Subproject commit db6ec995f148e1922da40fc53d23ed4fb583056f
+Subproject commit 7d3c78428819dc84c4da8ae1f302ba6c6a235a5d
diff --git a/include/mxnet/io.h b/include/mxnet/io.h
index 47a59eec54fe..5a8267befc1c 100644
--- a/include/mxnet/io.h
+++ b/include/mxnet/io.h
@@ -109,5 +109,21 @@ struct DataIteratorReg
   }                                                                     \
   DMLC_REGISTRY_REGISTER(::mxnet::DataIteratorReg, DataIteratorReg, name) \
   .set_body(__create__ ## DataIteratorType ## __)
+/*!
+ * \brief Macro to register a chained iterator: an iterator of type
+ *  HoldingDataIterType that wraps a newly created ChainedDataIterType.
+ *
+ * \code
+ * // example of registering an imagerec iterator
+ * MXNET_REGISTER_IO_CHAINED_ITER(ImageRec, ImageRecordIter, BatchIter)
+ * .describe("batched image record data iterator");
+ * \endcode
+ */
+#define MXNET_REGISTER_IO_CHAINED_ITER(name, ChainedDataIterType, HoldingDataIterType) \
+  static ::mxnet::IIterator<DataBatch>* __create__ ## ChainedDataIterType ## __() { \
+    return new HoldingDataIterType(new ChainedDataIterType);            \
+  }                                                                     \
+  DMLC_REGISTRY_REGISTER(::mxnet::DataIteratorReg, DataIteratorReg, name) \
+  .set_body(__create__ ## ChainedDataIterType ## __)
 }  // namespace mxnet
 #endif  // MXNET_IO_H_
diff --git a/src/common/utils.h b/src/common/utils.h
index cf1fd2f1bb36..f7a2dcce0470 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -22,6 +22,11 @@ namespace common {
  */
 typedef std::mt19937 RANDOM_ENGINE;
 
+// Draw a float, nominally in [0, 1); prnd is a pointer to a random engine
+#define NextDouble(prnd) std::generate_canonical<float, 10>(*(prnd))
+// Draw an integer, nominally in [0, range); arguments are parenthesized so expressions are safe
+#define NextUInt32(range, prnd) static_cast<uint32_t>(\
+        floor(std::generate_canonical<float, 10>(*(prnd)) * (range)))
 /*!
  * \brief Helper functions.
  */
diff --git a/src/io/inst_vector.h b/src/io/inst_vector.h
index 1ae734631680..9490ceab94c1 100644
--- a/src/io/inst_vector.h
+++ b/src/io/inst_vector.h
@@ -1,17 +1,19 @@
 /*!
- *  Copyright (c) 2015 by Contributors
- * \inst_vector.h
+ * \file inst_vector.h
  * \brief holder of a sequence of DataInst in CPU
  *        that are not necessarily of same shape
  */
-#ifndef MXNET_IO_INST_VECTOR_H_
-#define MXNET_IO_INST_VECTOR_H_
+
+#ifndef MXNET_INST_VECTOR_H_
+#define MXNET_INST_VECTOR_H_
+
+#include "./data.h"
+#include <vector>
 #include <dmlc/base.h>
 #include <mshadow/tensor.h>
-#include <vector>
-#include <string>
-#include "./data.h"
+
 namespace mxnet {
+namespace io {
 /*!
  * \brief tensor vector that can store sequence of tensor
  *  in a memory compact way, tensors do not have to be of same shape
@@ -28,7 +30,7 @@ class TensorVector {
     CHECK(i + 1 < offset_.size());
     CHECK(shape_[i].Size() == offset_[i + 1] - offset_[i]);
     return mshadow::Tensor<cpu, dim, DType>
-        (reinterpret_cast<DType*>(BeginPtr(content_)) + offset_[i], shape_[i]);
+        ((DType*)BeginPtr(content_) + offset_[i], shape_[i]);
   }
   inline mshadow::Tensor<cpu, dim, DType> Back() const {
     return (*this)[Size() - 1];
@@ -49,7 +51,6 @@ class TensorVector {
     content_.clear();
     shape_.clear();
   }
-
  private:
   // offset of the data content
   std::vector<size_t> offset_;
@@ -59,59 +60,48 @@ class TensorVector {
   std::vector<mshadow::Shape<dim> > shape_;
 };
 
-/*!
- * \brief tblob vector that can store sequence of tblob
- *  in a memory compact way, tblobs do not have to be of same shape
- */
-template<typename DType>
-class TBlobVector {
- public:
-  TBlobVector(void) {
-    this->Clear();
-  }
-  // get i-th tblob
-  inline TBlob operator[](size_t i) const;
-  // get the last tblob
-  inline TBlob Back();
-  // return the size of the vector
-  inline size_t Size(void) const;
-  // push a tensor of certain shape
-  // return the reference of the pushed tensor
-  inline void Push(TShape shape_);
-  inline void Clear(void);
- private:
-  // offset of the data content
-  std::vector<size_t> offset_;
-  // data content
-  std::vector<DType> content_;
-  // shape of data
-  std::vector<TShape > shape_;
-};
-
 /*!
  * \brief instance vector that can holds
  * non-uniform shape data instance in a shape efficient way
  */
 class InstVector {
- public:
+ public:  
   inline size_t Size(void) const {
     return index_.size();
   }
   // instance
-  inline DataInst operator[](size_t i) const;
+  inline DataInst operator[](size_t i) const {
+    DataInst inst;
+    inst.index = index_[i];
+    inst.data = data_[i];
+    inst.label = label_[i];
+    return inst;
+  }
   // get back of instance vector
-  inline DataInst Back() const;
-  // clear the container
-  inline void Clear(void);
-  // push the newly coming instance
-  inline void Push(unsigned index, TBlob data_);
-
- private:
+  inline DataInst Back() const {
+    return (*this)[Size() - 1];
+  }
+  inline void Clear(void) {
+    index_.clear();
+    data_.Clear();
+    label_.Clear();
+  }
+  inline void Push(unsigned index,
+                   mshadow::Shape<3> dshape,
+                   mshadow::Shape<1> lshape) {
+    index_.push_back(index);
+    data_.Push(dshape);
+    label_.Push(lshape);
+  }
+  
+ private:  
   /*! \brief index of the data */
   std::vector<unsigned> index_;
+  // data
+  TensorVector<3, real_t> data_;
   // data
-  std::vector<TensorVector<real_t> > data_;
-  // extra data
-  std::vector<std::string> extra_data_;
+  TensorVector<1, real_t> label_;
 };
-#endif  // MXNET_IO_INST_VECTOR_H_
+}  // namespace io
+}  // namespace mxnet
+#endif  // MXNET_INST_VECTOR_H_
diff --git a/src/io/io.cc b/src/io/io.cc
index bd5b78dda643..9095f4089c92 100644
--- a/src/io/io.cc
+++ b/src/io/io.cc
@@ -4,7 +4,14 @@
 
 #include <mxnet/io.h>
 #include <dmlc/registry.h>
+#include <image_augmenter.h>
+#include <>
+#include <iter_batch.h>
 
+// Registers
 namespace dmlc {
 DMLC_REGISTRY_ENABLE(::mxnet::DataIteratorReg);
-}  // namespace dmlc
+// Register parameters in header files
+DMLC_REGISTER_PARAMETER(BatchParam);
+DMLC_REGISTER_PARAMETER(ImageAugmenterParam);
+}  // namespace dmlc
\ No newline at end of file
diff --git a/src/io/iter_mnist.cc b/src/io/iter_mnist.cc
index 93195061b278..77ac3a479f75 100644
--- a/src/io/iter_mnist.cc
+++ b/src/io/iter_mnist.cc
@@ -31,7 +31,7 @@ struct MNISTParam : public dmlc::Parameter<MNISTParam> {
   bool flat;
   /*! \brief random seed */
   int seed;
-  // declare parameters in header file
+  // declare parameters
   DMLC_DECLARE_PARAMETER(MNISTParam) {
     DMLC_DECLARE_FIELD(image).set_default("./train-images-idx3-ubyte")
         .describe("Mnist image path.");
@@ -155,7 +155,7 @@ class MNISTIter: public IIterator<DataBatch> {
     delete stdlabel;
   }
   inline void Shuffle(void) {
-    std::shuffle(inst_.begin(), inst_.end(), common::RANDOM_ENGINE(kRandMagic+param_.seed));
+    std::shuffle(inst_.begin(), inst_.end(), common::RANDOM_ENGINE(kRandMagic + param_.seed));
     std::vector<float> tmplabel(labels_.size());
     mshadow::TensorContainer<cpu, 3> tmpimg(img_.shape_);
     for (size_t i = 0; i < inst_.size(); ++i) {

From 2251812d5dd693a608453c3b8faa2a2962568c28 Mon Sep 17 00:00:00 2001
From: sneakerkg <xiaotj1990327@gmail.com>
Date: Thu, 3 Sep 2015 08:42:50 +0800
Subject: [PATCH 02/15] add image rec and associate files in

---
 src/io/image_augmenter.h      | 262 ++++++++++++++++++++++++
 src/io/image_recordio.h       |  75 +++++++
 src/io/iter_batch.h           | 162 +++++++++++++++
 src/io/iter_image_recordio.cc | 369 ++++++++++++++++++++++++++++++++++
 src/utils/decoder.h           | 128 ++++++++++++
 src/utils/io.h                | 175 ++++++++++++++++
 src/utils/thread_buffer.h     | 205 +++++++++++++++++++
 7 files changed, 1376 insertions(+)
 create mode 100644 src/io/image_augmenter.h
 create mode 100644 src/io/image_recordio.h
 create mode 100644 src/io/iter_batch.h
 create mode 100644 src/io/iter_image_recordio.cc
 create mode 100644 src/utils/decoder.h
 create mode 100644 src/utils/io.h
 create mode 100644 src/utils/thread_buffer.h

diff --git a/src/io/image_augmenter.h b/src/io/image_augmenter.h
new file mode 100644
index 000000000000..d33464c4a889
--- /dev/null
+++ b/src/io/image_augmenter.h
@@ -0,0 +1,262 @@
+/*!
+ * \file image_augmenter.h
+ * \brief helper class for image augmentation, built on OpenCV
+ * \author Naiyan Wang, Tianqi Chen
+ */
+#ifndef MXNET_IO_IMAGE_AUGMENTER_H_
+#define MXNET_IO_IMAGE_AUGMENTER_H_
+
+#include <opencv2/opencv.hpp>
+#include "../common/utils.h"
+
+namespace mxnet {
+namespace io {
+/*! \brief image augmentation parameters*/
+struct ImageAugmentParam : public dmlc::Parameter<ImageAugmentParam> {
+  /*! \brief whether we do random cropping */
+  bool rand_crop_;
+  /*! \brief where to nonrandom crop on y */
+  int crop_y_start_;
+  /*! \brief where to nonrandom crop on x */
+  int crop_x_start_;
+  /*! \brief the max rotation angle for augmentation, a random rotation is drawn from */
+  /*! \brief [-max_rotate_angle, max_rotate_angle] */
+  int max_rotate_angle_;
+  /*! \brief max aspect ratio */
+  float max_aspect_ratio_;
+  /*! \brief random shear the image [-max_shear_ratio, max_shear_ratio] */
+  float max_shear_ratio_;
+  /*! \brief max crop size */
+  int max_crop_size_;
+  /*! \brief min crop size */
+  int min_crop_size_;
+  /*! \brief max scale ratio */
+  float max_random_scale_;
+  /*! \brief min scale_ratio */
+  float min_random_scale_;
+  /*! \brief min image size */
+  float min_img_size_;
+  /*! \brief max image size */
+  float max_img_size_;
+  /*! \brief whether to mirror the image */
+  bool mirror_;
+  /*! \brief fixed rotate angle, applied when it is greater than 0 */
+  int rotate_;
+  /*! \brief filled color while padding */
+  int fill_value_;
+  // declare parameters; TODO: the valid range of some params is not understood yet
+  DMLC_DECLARE_PARAMETER(ImageAugmentParam) {
+    DMLC_DECLARE_FIELD(rand_crop_).set_default(true)
+        .describe("Whether we de random cropping");
+    DMLC_DECLARE_FIELD(crop_y_start_).set_default(-1)
+        .describe("Where to nonrandom crop on y");
+    DMLC_DECLARE_FIELD(crop_x_start_).set_default(-1)
+        .describe("Where to nonrandom crop on x");
+    DMLC_DECLARE_FIELD(max_rotate_angle_).set_default(0)
+        .describe("Rotate can be [-max_rotate_angle, max_rotate_angle]");
+    DMLC_DECLARE_FIELD(max_aspect_ratio_).set_default(0.0f)
+        .describe("Max aspect ratio");
+    DMLC_DECLARE_FIELD(max_shear_ratio_).set_default(0.0f)
+        .describe("Shear rotate can be made between [-max_shear_ratio_, max_shear_ratio_]");
+    DMLC_DECLARE_FIELD(max_crop_size_).set_default(-1)
+        .describe("Maximum crop size");
+    DMLC_DECLARE_FIELD(min_crop_size_).set_default(-1)
+        .describe("Minimum crop size");
+    DMLC_DECLARE_FIELD(max_random_scale_).set_default(1.0f)
+        .describe("Maxmum scale ratio");
+    DMLC_DECLARE_FIELD(min_random_scale_).set_default(1.0f)
+        .describe("Minimum scale ratio");
+    DMLC_DECLARE_FIELD(max_img_size_).set_default(1e10f)
+        .describe("Maxmum image size");
+    DMLC_DECLARE_FIELD(min_img_size_).set_default(0.0f)
+        .describe("Minimum image size");
+    DMLC_DECLARE_FIELD(mirror_).set_default(false)
+        .describe("Whether to mirror the image");
+    DMLC_DECLARE_FIELD(rotate_).set_default(-1)
+        .describe("Rotate angle");
+    DMLC_DECLARE_FIELD(fill_value_).set_default(255)
+        .describe("Filled value while padding");
+  }
+};
+
+/*! \brief helper class to do image augmentation */
+class ImageAugmenter {
+ public:
+  // constructor
+  ImageAugmenter(void)
+      : tmpres(false),
+        rotateM(2, 3, CV_32F) {
+  }
+  virtual ~ImageAugmenter() {
+  }
+  /*! \brief init from kwargs; TODO: input_shape and rotate_list are parsed by hand, not via param */
+  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    std::vector<std::pair<std::string, std::string> > kwargs_left = param_.InitAllowUnknown(kwargs);
+    for (size_t i = 0; i < kwargs_left.size(); i++) {
+      if (kwargs_left[i].first == "input_shape") {
+        CHECK(sscanf(kwargs_left[i].second.c_str(), "%u,%u,%u", &shape_[0], &shape_[1], &shape_[2]) == 3)
+            << "input_shape must be three consecutive integers without space example: 1,1,200 ";
+      }
+      if (kwargs_left[i].first == "rotate_list") {
+        const char *val = kwargs_left[i].second.c_str();
+        char buf[128];  // one comma separated field; %127[^,] keeps sscanf inside it
+        while (*val != '\0') {
+          if (sscanf(val, "%127[^,]", buf) == 1) {
+            rotate_list_.push_back(atoi(buf));
+            val += strlen(buf);
+          }
+          if (*val != '\0') ++val;  // step over the comma; an empty field is skipped
+        }
+      }
+    }
+  }
+  /*!
+   * \brief augment src image and return the result
+   *   this function is not thread safe, and will only be called by one thread
+   *   however, it tries to re-use memory space as much as possible
+   * \param src the source image
+   * \param prnd source of random numbers
+   * \return the augmented image, warped and cropped according to param_ and shape_
+   */
+  virtual cv::Mat Process(const cv::Mat &src,
+                          common::RANDOM_ENGINE *prnd) {
+    // shear (NextDouble/NextUInt32 are macros, so they cannot be namespace qualified)
+    float s = NextDouble(prnd) * param_.max_shear_ratio_ * 2 - param_.max_shear_ratio_;
+    // rotate
+    int angle = static_cast<int>(NextUInt32((param_.max_rotate_angle_ * 2), prnd)) - param_.max_rotate_angle_;
+    if (param_.rotate_ > 0) angle = param_.rotate_;
+    if (rotate_list_.size() > 0) {
+      // pick any listed angle, the last one included; the modulo keeps the index in range
+      angle = rotate_list_[NextUInt32((rotate_list_.size()), prnd) % rotate_list_.size()];
+    }
+    float a = cos(angle / 180.0 * M_PI);
+    float b = sin(angle / 180.0 * M_PI);
+    // scale
+    float scale = NextDouble(prnd) * (param_.max_random_scale_ - param_.min_random_scale_) + param_.min_random_scale_;
+    // aspect ratio
+    float ratio = NextDouble(prnd) * param_.max_aspect_ratio_ * 2 - param_.max_aspect_ratio_ + 1;
+    float hs = 2 * scale / (1 + ratio);
+    float ws = ratio * hs;
+    // new width and height
+    float new_width = std::max(param_.min_img_size_, std::min(param_.max_img_size_, scale * src.cols));
+    float new_height = std::max(param_.min_img_size_, std::min(param_.max_img_size_, scale * src.rows));
+    // 2x3 affine matrix built from rotation (a, b), shear (s) and scale (hs, ws)
+    cv::Mat M(2, 3, CV_32F);
+    M.at<float>(0, 0) = hs * a - s * b * ws;
+    M.at<float>(1, 0) = -b * ws;
+    M.at<float>(0, 1) = hs * b + s * a * ws;
+    M.at<float>(1, 1) = a * ws;
+    float ori_center_width = M.at<float>(0, 0) * src.cols + M.at<float>(0, 1) * src.rows;
+    float ori_center_height = M.at<float>(1, 0) * src.cols + M.at<float>(1, 1) * src.rows;
+    M.at<float>(0, 2) = (new_width - ori_center_width) / 2;
+    M.at<float>(1, 2) = (new_height - ori_center_height) / 2;
+    cv::warpAffine(src, temp, M, cv::Size(new_width, new_height),
+                     cv::INTER_LINEAR,
+                     cv::BORDER_CONSTANT,
+                     cv::Scalar(param_.fill_value_, param_.fill_value_, param_.fill_value_));
+    cv::Mat res = temp;
+    if (param_.max_crop_size_ != -1 || param_.min_crop_size_ != -1) {
+      // crop a square of random size, then resize it to the input shape
+      CHECK(res.cols >= param_.max_crop_size_ && res.rows >= param_.max_crop_size_ && param_.max_crop_size_ >= param_.min_crop_size_)
+          << "input image size smaller than max_crop_size";
+      mshadow::index_t rand_crop_size = NextUInt32((param_.max_crop_size_ - param_.min_crop_size_ + 1), prnd) + param_.min_crop_size_;
+      mshadow::index_t y = res.rows - rand_crop_size;
+      mshadow::index_t x = res.cols - rand_crop_size;
+      if (param_.rand_crop_ != 0) {
+        y = NextUInt32((y + 1), prnd);
+        x = NextUInt32((x + 1), prnd);
+      } else {
+        y /= 2; x /= 2;
+      }
+      cv::Rect roi(x, y, rand_crop_size, rand_crop_size);
+      cv::resize(res(roi), res, cv::Size(shape_[1], shape_[2]));
+    } else {
+      // crop a window that has exactly the input shape
+      CHECK(static_cast<mshadow::index_t>(res.cols) >= shape_[1] && static_cast<mshadow::index_t>(res.rows) >= shape_[2])
+          << "input image size smaller than input shape";
+      mshadow::index_t y = res.rows - shape_[2];
+      mshadow::index_t x = res.cols - shape_[1];
+      if (param_.rand_crop_ != 0) {
+        y = NextUInt32((y + 1), prnd);
+        x = NextUInt32((x + 1), prnd);
+      } else {
+        y /= 2; x /= 2;
+      }
+      cv::Rect roi(x, y, shape_[1], shape_[2]);
+      res = res(roi);
+    }
+    return res;
+  }
+  /*!
+   * \brief augment a tensor image and return the result
+   *   this function is not thread safe, and will only be called by one thread
+   *   however, it tries to re-use memory space as much as possible
+   * \param data the source image tensor, returned unchanged when no processing is needed
+   * \param prnd source of random numbers
+   * \return the augmented image, stored in a buffer owned by this augmenter
+   */
+  virtual mshadow::Tensor<cpu, 3> Process(mshadow::Tensor<cpu, 3> data,
+                                          common::RANDOM_ENGINE *prnd) {
+    if (!NeedProcess()) return data;
+    cv::Mat res(data.size(1), data.size(2), CV_8UC3);
+    for (index_t i = 0; i < data.size(1); ++i) {
+      for (index_t j = 0; j < data.size(2); ++j) {
+        res.at<cv::Vec3b>(i, j)[0] = data[2][i][j];
+        res.at<cv::Vec3b>(i, j)[1] = data[1][i][j];
+        res.at<cv::Vec3b>(i, j)[2] = data[0][i][j];
+      }
+    }
+    res = this->Process(res, prnd);
+    tmpres.Resize(mshadow::Shape3(3, res.rows, res.cols));
+    for (index_t i = 0; i < tmpres.size(1); ++i) {
+      for (index_t j = 0; j < tmpres.size(2); ++j) {
+        cv::Vec3b bgr = res.at<cv::Vec3b>(i, j);
+        tmpres[0][i][j] = bgr[2];
+        tmpres[1][i][j] = bgr[1];
+        tmpres[2][i][j] = bgr[0];
+      }
+    }
+    return tmpres;
+  }
+
+  virtual void Process(unsigned char *dptr, size_t sz,
+                       mshadow::TensorContainer<cpu, 3> *p_data,
+                       common::RANDOM_ENGINE *prnd) {
+    cv::Mat buf(1, sz, CV_8U, dptr);
+    cv::Mat res = cv::imdecode(buf, 1);
+    res = this->Process(res, prnd);
+    p_data->Resize(mshadow::Shape3(3, res.rows, res.cols));
+    for (index_t i = 0; i < p_data->size(1); ++i) {
+      for (index_t j = 0; j < p_data->size(2); ++j) {
+        cv::Vec3b bgr = res.at<cv::Vec3b>(i, j);
+        (*p_data)[0][i][j] = bgr[2];
+        (*p_data)[1][i][j] = bgr[1];
+        (*p_data)[2][i][j] = bgr[0];
+      }
+    }
+    res.release();
+  }
+
+ private:
+  // whether any augmentation that requires processing is configured
+  inline bool NeedProcess(void) const {
+    if (param_.max_rotate_angle_ > 0 || param_.max_shear_ratio_ > 0.0f
+        || param_.rotate_ > 0 || rotate_list_.size() > 0) return true;
+    if (param_.min_crop_size_ > 0 && param_.max_crop_size_ > 0) return true;
+    return false;
+  }
+  // buffer that holds the result of the tensor Process overload
+  mshadow::TensorContainer<cpu, 3> tmpres;
+  // temporal space
+  cv::Mat temp0, temp, temp2;
+  // rotation param
+  cv::Mat rotateM;
+  ImageAugmentParam param_;  // augmentation parameters, filled in by Init
+  /*! \brief input shape */
+  mshadow::Shape<4> shape_;
+  /*! \brief list of possible rotate angle */
+  std::vector<int> rotate_list_;
+};
+}  // namespace io
+}  // namespace mxnet
+#endif  // MXNET_IO_IMAGE_AUGMENTER_H_
diff --git a/src/io/image_recordio.h b/src/io/image_recordio.h
new file mode 100644
index 000000000000..4aea8aabcb47
--- /dev/null
+++ b/src/io/image_recordio.h
@@ -0,0 +1,75 @@
+/*!
+ * \file image_recordio.h
+ * \brief image recordio struct
+ */
+#ifndef MXNET_IO_IMAGE_RECORDIO_H_
+#define MXNET_IO_IMAGE_RECORDIO_H_
+
+#include <dmlc/base.h>
+#include <dmlc/io.h>
+
+namespace mxnet {
+namespace io {
+/*! \brief image recordio struct */
+struct ImageRecordIO {
+  /*! \brief header in image recordio */
+  struct Header {
+    /*!
+     * \brief flag of the header,
+     *  used for future extension purposes
+     */
+    uint32_t flag;
+    /*!
+     * \brief label field that returns label of images
+     *  when image list was not presented,
+     * 
+     * NOTE: user do not need to repack recordio just to
+     * change label field, just supply a list file that
+     * maps image id to new labels
+     */
+    float label;
+    /*!
+     * \brief unique image index
+     *  image_id[1] is always set to 0,
+     *  reserved for future purposes for 128bit id
+     *  image_id[0] is used to store image id
+     */
+    uint64_t image_id[2];
+  };
+  /*! \brief header of image recordio */
+  Header header;
+  /*! \brief pointer to data content */
+  uint8_t *content;
+  /*! \brief size of the content */
+  size_t content_size;
+  /*! \brief constructor */
+  ImageRecordIO(void)
+      : content(NULL), content_size(0) {
+    memset(&header, 0, sizeof(header));
+  }
+  /*! \brief get image id from record */
+  inline uint64_t image_index(void) const {
+    return header.image_id[0];
+  }
+  /*!
+   * \brief load header from a record content 
+   * \param buf the head of record
+   * \param size the size of the entire record   
+   */
+  inline void Load(void *buf, size_t size) {
+    CHECK(size >= sizeof(header));
+    std::memcpy(&header, buf, sizeof(header));
+    content = reinterpret_cast<uint8_t*>(buf) + sizeof(header);
+    content_size = size - sizeof(header);
+  }
+  /*!
+   * \brief save the record header
+   */
+  inline void SaveHeader(std::string *blob) const {
+    blob->resize(sizeof(header));
+    std::memcpy(dmlc::BeginPtr(*blob), &header, sizeof(header));    
+  }  
+}; 
+}  // namespace io
+}  // namespace mxnet
+#endif  // MXNET_IO_IMAGE_RECORDIO_H_
diff --git a/src/io/iter_batch.h b/src/io/iter_batch.h
new file mode 100644
index 000000000000..a0e4ab7e7ba5
--- /dev/null
+++ b/src/io/iter_batch.h
@@ -0,0 +1,162 @@
+/*!
+ * \file iter_batch.h
+ * \brief definition of preprocessing iterators that takes an iterator and do some preprocessing
+ * \author Tianqi Chen
+ */
+#ifndef MXNET_IO_ITER_BATCH_H_
+#define MXNET_IO_ITER_BATCH_H_
+
+#include <mxnet/io.h>
+#include <mxnet/base.h>
+#include <dmlc/logging.h>
+#include <mshadow/tensor.h>
+
+namespace mxnet {
+namespace io {
+// Batch parameters
+struct BatchParam : public dmlc::Parameter<BatchParam> {
+  /*! \brief batch size */
+  index_t batch_size_;
+  /*! \brief label width */
+  index_t label_width_;
+  /*! \brief use round robin to handle overflow batch */
+  bool round_batch_;
+  /*! \brief skip read */
+  bool test_skipread_;
+  /*! \brief silent */
+  bool silent_;
+  // declare parameters
+  DMLC_DECLARE_PARAMETER(BatchParam) {
+    DMLC_DECLARE_FIELD(batch_size_).set_default(1)
+        .describe("Batch size.");
+    DMLC_DECLARE_FIELD(label_width_).set_default(1)
+        .describe("Label width.");
+    DMLC_DECLARE_FIELD(round_batch_).set_default(false)
+        .describe("Use round robin to handle overflow batch.");
+    DMLC_DECLARE_FIELD(test_skipread_).set_default(false)
+        .describe("Skip read for testing.");
+    DMLC_DECLARE_FIELD(silent_).set_default(false)
+        .describe("Whether to print batch information.");
+  }
+};
+    
+/*! \brief batch iterator that owns a single instance iterator and packs its instances into batches */
+class BatchAdaptIter: public IIterator<DataBatch> {
+public:
+  explicit BatchAdaptIter(IIterator<DataInst> *base): base_(base), head_(1), num_overflow_(0) {
+    label.dptr_ = NULL;  // lets FreeSpaceDense see that nothing is allocated yet
+  }
+  virtual ~BatchAdaptIter(void) {
+    delete base_;
+    FreeSpaceDense();
+  }
+  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    std::vector<std::pair<std::string, std::string> > kwargs_left;
+    // init batch param, keys it does not know are returned in kwargs_left
+    kwargs_left = param_.InitAllowUnknown(kwargs);
+    for (size_t i = 0; i < kwargs_left.size(); i++) {
+      if (!strcmp(kwargs_left[i].first.c_str(), "input_shape")) {
+        CHECK(sscanf(kwargs_left[i].second.c_str(), "%u,%u,%u", &shape_[1], &shape_[2], &shape_[3]) == 3)
+          << "input_shape must be three consecutive integers without space example: 1,1,200 ";
+      }
+    }
+    // init base iterator
+    base_->Init(kwargs);
+    // the leading dimension of the batch tensor is the batch size
+    shape_[0] = param_.batch_size_;
+    AllocSpaceDense(false);
+  }
+  virtual void BeforeFirst(void) {
+    if (param_.round_batch_ == 0 || num_overflow_ == 0) {
+      // otherwise, we already called before first
+      base_->BeforeFirst();
+    } else {
+      num_overflow_ = 0;
+    }
+    head_ = 1;
+  }
+  virtual bool Next(void) {
+    out_.num_batch_padd = 0;
+    // skip read if in head version
+    if (param_.test_skipread_ != 0 && head_ == 0) return true;
+    else this->head_ = 0;
+
+    // if overflow from previous round, directly return false, until before first is called
+    if (num_overflow_ != 0) return false;
+    index_t top = 0;
+
+    while (base_->Next()) {
+      const DataInst& d = base_->Value();
+      mshadow::Copy(label[top], d.data[1].get<mshadow::cpu, 2, float>());
+      out_.inst_index[top] = d.index;
+      mshadow::Copy(data[top], d.data[0].get<mshadow::cpu, 4, float>());
+      if (++ top >= param_.batch_size_) {
+        out_.data[0] = TBlob(data);
+        out_.data[1] = TBlob(label);
+        return true;
+      }
+    }
+    if (top != 0) {
+      if (param_.round_batch_ != 0) {
+        num_overflow_ = 0;
+        base_->BeforeFirst();
+        for (; top < param_.batch_size_; ++top, ++num_overflow_) {
+          CHECK(base_->Next()) << "number of input must be bigger than batch size";
+          const DataInst& d = base_->Value();
+          mshadow::Copy(label[top], d.data[1].get<mshadow::cpu, 2, float>());
+          out_.inst_index[top] = d.index;
+          mshadow::Copy(data[top], d.data[0].get<mshadow::cpu, 4, float>());
+        }
+        out_.num_batch_padd = num_overflow_;
+      } else {
+        out_.num_batch_padd = param_.batch_size_ - top;
+      }
+      out_.data[0] = TBlob(data);
+      out_.data[1] = TBlob(label);
+      return true;
+    }
+    return false;
+  }
+  virtual const DataBatch &Value(void) const {
+    CHECK(head_ == 0) << "must call Next to get value";
+    return out_;
+  }
+private:
+  /*! \brief base iterator */
+  IIterator<DataInst> *base_;
+  /*! \brief batch parameters, filled in by Init */
+  BatchParam param_;
+  /*! \brief input shape */
+  mshadow::Shape<4> shape_;
+  /*! \brief output data */
+  DataBatch out_;
+  /*! \brief on first */
+  int head_;
+  /*! \brief number of overflow instances that readed in round_batch mode */
+  int num_overflow_;
+  /*! \brief label information of the data*/
+  mshadow::Tensor<mshadow::cpu, 2> label;
+  /*! \brief content of dense data, if this DataBatch is dense */
+  mshadow::Tensor<mshadow::cpu, 4> data;
+  // Functions that allocate and free tensor space
+  inline void AllocSpaceDense(bool pad = false) {
+    data = mshadow::NewTensor<mshadow::cpu>(shape_, 0.0f, pad);
+    mshadow::Shape<2> lshape = mshadow::Shape2(param_.batch_size_, param_.label_width_);
+    label = mshadow::NewTensor<mshadow::cpu>(lshape, 0.0f, pad);
+    out_.inst_index = new unsigned[param_.batch_size_];
+    out_.batch_size = param_.batch_size_;
+    out_.data.resize(2);
+  }
+  /*! \brief auxiliary function to free space, if needed, dense only */
+  inline void FreeSpaceDense(void) {
+    if (label.dptr_ != NULL) {
+      delete [] out_.inst_index;
+      mshadow::FreeSpace(&label);
+      mshadow::FreeSpace(&data);
+      label.dptr_ = NULL;
+    }
+  }
+}; // class BatchAdaptIter
+}  // namespace io
+}  // namespace mxnet
+#endif  // MXNET_IO_ITER_BATCH_H_
\ No newline at end of file
diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc
new file mode 100644
index 000000000000..2ab1aa8958cb
--- /dev/null
+++ b/src/io/iter_image_recordio.cc
@@ -0,0 +1,369 @@
+/*!
+ * \file iter_image_recordio.cc
+ * \brief recordio data
+ *        iterator
+ */
+#include <cstdlib>
+#include <dmlc/base.h>
+#include <dmlc/io.h>
+#include <dmlc/omp.h>
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <dmlc/recordio.h>
+#include <dmlc/threadediter.h>
+#include <unordered_map>
+#include <vector>
+#include "./inst_vector.h"
+#include "./image_recordio.h"
+#include "./image_augmenter.h"
+#include "../utils/decoder.h"
+namespace mxnet {
+namespace io {
+/*! \brief data structure to hold labels for images */
+class ImageLabelMap {
+ public:
+  /*!
+   * \brief initialize the label list into memory
+   * \param path_imglist path to the image list
+   * \param label_width predefined label_width
+   * \param silent when true, skip the summary log line after loading
+   */
+  explicit ImageLabelMap(const char *path_imglist,
+                         mshadow::index_t label_width,
+                         bool silent) {
+    label_width_ = label_width;
+    image_index_.clear();
+    label_.clear();
+    idx2label_.clear();
+    dmlc::InputSplit *fi = dmlc::InputSplit::Create(path_imglist, 0, 1, "text");
+    dmlc::InputSplit::Blob rec;
+    while (fi->NextRecord(&rec)) {
+      // quick manual parsing
+      char *p = reinterpret_cast<char*>(rec.dptr);
+      char *end = p + rec.size;
+      // skip space; test the bound first so that *end is never dereferenced
+      while (p != end && isspace(static_cast<unsigned char>(*p))) ++p;
+      image_index_.push_back(static_cast<size_t>(atol(p)));
+      for (size_t i = 0; i < label_width_; ++i) {
+        // skip till space
+        while (p != end && !isspace(static_cast<unsigned char>(*p))) ++p;
+        // skip space
+        while (p != end && isspace(static_cast<unsigned char>(*p))) ++p;
+        CHECK(p != end) << "Bad ImageList format";
+        label_.push_back(static_cast<real_t>(atof(p)));
+      }
+    }
+    delete fi;
+    // idx2label_ keeps raw pointers into label_, so label_ must not be resized afterwards
+    idx2label_.reserve(image_index_.size());
+    for (size_t i = 0; i < image_index_.size(); ++i) {
+      idx2label_[image_index_[i]] = BeginPtr(label_) + i * label_width_;
+    }
+    if (!silent) {
+      LOG(INFO) << "Loaded ImageList from " << path_imglist << ' '
+                << image_index_.size() << " Image records";
+    }
+  }
+  /*! \brief find a label for corresponding index, fails a CHECK if the index is unknown */
+  inline mshadow::Tensor<cpu, 1> Find(size_t imid) const {
+    std::unordered_map<size_t, real_t*>::const_iterator it
+        = idx2label_.find(imid);
+    CHECK(it != idx2label_.end()) << "fail to find imagelabel for id " << imid;
+    return mshadow::Tensor<cpu, 1>(it->second, mshadow::Shape1(label_width_));
+  }
+
+ private:
+  // label width: number of label values stored per image
+  mshadow::index_t label_width_;
+  // image index of each record
+  std::vector<size_t> image_index_;
+  // real label content
+  std::vector<real_t> label_;
+  // map index to label
+  std::unordered_map<size_t, real_t*> idx2label_;
+};
+
+// Parameters of ImageRecordIOParser: input paths, threading and worker split
+struct ImageRecParserParam : public dmlc::Parameter<ImageRecParserParam> {
+  /*! \brief path to image list */
+  std::string path_imglist_;
+  /*! \brief path to image recordio */
+  std::string path_imgrec_;
+  /*! \brief number of threads */
+  int nthread_;
+  /*! \brief whether to remain silent */
+  bool silent_;
+  /*! \brief number of distributed workers, and the rank of this worker */
+  int dist_num_worker_, dist_worker_rank_;
+  /*! \brief label-width */
+  int label_width_;
+  // declare parameters
+  DMLC_DECLARE_PARAMETER(ImageRecParserParam) {
+    DMLC_DECLARE_FIELD(path_imglist_).set_default("")
+        .describe("Path to image list.");
+    DMLC_DECLARE_FIELD(path_imgrec_).set_default("./data/imgrec.rec")
+        .describe("Path to image record file.");
+    DMLC_DECLARE_FIELD(nthread_).set_lower_bound(1).set_default(4)
+        .describe("Number of thread to do parsing.");
+    DMLC_DECLARE_FIELD(label_width_).set_lower_bound(1).set_default(1)
+        .describe("How many labels for an image.");
+    DMLC_DECLARE_FIELD(silent_).set_default(false)
+        .describe("Whether to suppress parser information.");
+    DMLC_DECLARE_FIELD(dist_num_worker_).set_lower_bound(1).set_default(1)
+        .describe("Dist worker number.");
+    DMLC_DECLARE_FIELD(dist_worker_rank_).set_default(0)
+        .describe("Dist worker rank.");
+  }
+};
+
+// parses chunks of an image recordio source into per-thread instance vectors
+class ImageRecordIOParser {
+ public:
+  ImageRecordIOParser(void)
+      : source_(NULL),
+        label_map_(NULL) {
+  }
+  ~ImageRecordIOParser(void) {
+    // both pointers can still be NULL here; deleting NULL is a no-op
+    delete label_map_;
+    delete source_;
+    for (size_t i = 0; i < augmenters_.size(); ++i) {
+      delete augmenters_[i];
+    }
+    for (size_t i = 0; i < prnds_.size(); ++i) {
+      delete prnds_[i];
+    }
+  }
+  // initialize the parser from key-value arguments
+  inline void Init(const std::vector<std::pair<std::string, std::string> >& kwargs);
+  
+  // rewind the underlying data source to its first record
+  inline void BeforeFirst(void) {
+    return source_->BeforeFirst();
+  }
+  // parse next set of records, return an array of
+  // instance vectors (one per parsing thread) to the user
+  inline bool ParseNext(std::vector<InstVector> *out);
+ private:
+  // magic number used to seed the per-thread prng
+  static const int kRandMagic = 111;
+  /*! \brief parameters */
+  ImageRecParserParam param_; 
+  /*! \brief augmenters, owned by the parser */
+  std::vector<ImageAugmenter*> augmenters_;
+  /*! \brief random samplers, owned by the parser */
+  std::vector<common::RANDOM_ENGINE*> prnds_;
+  /*! \brief data source */
+  dmlc::InputSplit *source_;
+  /*! \brief label information, if any */
+  ImageLabelMap *label_map_;
+};
+
+inline void ImageRecordIOParser::Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+  // initialize parameter
+  std::vector<std::pair<std::string, std::string> > kwargs_left;
+  // init image rec param; unknown keys are passed on to the augmenters below
+  kwargs_left = param_.InitAllowUnknown(kwargs);
+  // omp_get_num_procs() needs no parallel region; query it directly
+  const int maxthread = std::max(omp_get_num_procs() / 2 - 1, 1);
+  param_.nthread_ = std::min(maxthread, param_.nthread_);
+  // probe how many threads the runtime actually grants for that request
+  int threadget = 1;
+  #pragma omp parallel num_threads(param_.nthread_)
+  {
+    #pragma omp master
+    threadget = omp_get_num_threads();
+  }
+  param_.nthread_ = threadget;
+  // setup decoders: one augmenter and one prng per parsing thread
+  for (int i = 0; i < threadget; ++i) {
+    augmenters_.push_back(new ImageAugmenter());
+    augmenters_[i]->init(kwargs_left);
+    prnds_.push_back(new common::RANDOM_ENGINE((i + 1) * kRandMagic));
+  }
+
+  // handling for hadoop
+  // TODO, hack
+  const char *ps_rank = getenv("PS_RANK");
+  if (ps_rank != NULL) {
+    param_.dist_worker_rank_ = atoi(ps_rank);
+  }
+
+  if (param_.path_imglist_.length() != 0) {
+    label_map_ = new ImageLabelMap(param_.path_imglist_.c_str(),
+                                   param_.label_width_, param_.silent_);
+  } else {
+    param_.label_width_ = 1;
+  }
+  CHECK(param_.path_imgrec_.length() != 0)
+    << "ImageRecordIOIterator: must specify image_rec";
+#if MSHADOW_DIST_PS
+    // TODO move to a better place
+    param_.dist_num_worker_ = ::ps::RankSize();
+    param_.dist_worker_rank_ = ::ps::MyRank();
+    LOG(INFO) << "rank " << param_.dist_worker_rank_
+              << " in " << param_.dist_num_worker_;
+#endif
+  source_ = dmlc::InputSplit::Create
+      (param_.path_imgrec_.c_str(), param_.dist_worker_rank_,
+       param_.dist_num_worker_, "recordio");
+  // use 8 MB chunk when possible
+  source_->HintChunkSize(8 << 20UL);
+}
+
+// parse the next chunk of the source; returns false once the source is exhausted
+inline bool ImageRecordIOParser::
+ParseNext(std::vector<InstVector> *out_vec) {
+  CHECK(source_ != NULL);
+  dmlc::InputSplit::Blob chunk;
+  if (!source_->NextChunk(&chunk)) return false;
+  out_vec->resize(param_.nthread_);
+  #pragma omp parallel num_threads(param_.nthread_)
+  {
+    CHECK(omp_get_num_threads() == param_.nthread_);
+    int tid = omp_get_thread_num();
+    dmlc::RecordIOChunkReader reader(chunk, tid, param_.nthread_);
+    mxnet::ImageRecordIO rec;
+    dmlc::InputSplit::Blob blob;
+    // each thread fills its own output vector, so no locking is needed
+    InstVector &out = (*out_vec)[tid];
+    out.Clear();
+    while (reader.NextRecord(&blob)) {
+      rec.Load(blob.dptr, blob.size);
+      cv::Mat buf(1, rec.content_size, CV_8U, rec.content);
+      // decode as a 3-channel image; imdecode yields an empty Mat on failure
+      cv::Mat res = cv::imdecode(buf, 1);
+      CHECK(res.data != NULL) << "decoding fail";
+      res = augmenters_[tid]->Process(res, prnds_[tid]);
+      out.Push(static_cast<unsigned>(rec.image_index()),
+               mshadow::Shape3(3, res.rows, res.cols),
+               mshadow::Shape1(param_.label_width_));
+      DataInst inst = out.Back();
+      for (int i = 0; i < res.rows; ++i) {
+        for (int j = 0; j < res.cols; ++j) {
+          cv::Vec3b bgr = res.at<cv::Vec3b>(i, j);
+          inst.data[0][i][j] = bgr[2];
+          inst.data[1][i][j] = bgr[1];
+          inst.data[2][i][j] = bgr[0];
+        }
+      }
+      if (label_map_ != NULL) {
+        mshadow::Copy(inst.label, label_map_->Find(rec.image_index()));
+      } else {
+        inst.label[0] = rec.header.label;
+      }
+    }
+  }
+  return true;
+}
+
+// Parameters of the image record iterator: instance shuffling and its seed
+struct ImageRecordParam: public dmlc::Parameter<ImageRecordParam> {
+  /*! \brief whether to do shuffle */
+  bool shuffle;
+  /*! \brief random seed */
+  int seed;
+  // declare parameters and their defaults (shuffle on, seed 0)
+  DMLC_DECLARE_PARAMETER(ImageRecordParam) {
+    DMLC_DECLARE_FIELD(shuffle).set_default(true)
+        .describe("Whether to shuffle data.");
+    DMLC_DECLARE_FIELD(seed).set_default(0)
+        .describe("Random Seed.");
+  }
+};
+
+// iterator on image recordio: yields one DataInst at a time from parsed chunks
+class ImageRecordIter : public IIterator<DataInst> {
+ public:
+  ImageRecordIter()
+      : inst_ptr_(0), data_(NULL) {
+  }
+  virtual ~ImageRecordIter(void) {
+    iter_.Destroy();
+    // data can be NULL
+    delete data_;
+  }
+  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    std::vector<std::pair<std::string, std::string> > kwargs_left;
+    // init image rec param
+    kwargs_left = param_.InitAllowUnknown(kwargs);
+    // use the left kwarg to init parser
+    parser_.Init(kwargs_left);
+    // init thread iter
+    iter_.set_max_capacity(4);
+    iter_.Init([this](std::vector<InstVector> **dptr) {
+        if (*dptr == NULL) {
+          *dptr = new std::vector<InstVector>();
+        }
+        return parser_.ParseNext(*dptr);
+      },
+      [this]() { parser_.BeforeFirst(); });
+    inst_ptr_ = 0;
+  }
+  virtual void BeforeFirst(void) {
+    iter_.BeforeFirst();
+    inst_order_.clear();
+    inst_ptr_ = 0;
+  }
+  virtual bool Next(void) {
+    while (true) {
+      if (inst_ptr_ < inst_order_.size()) {
+        std::pair<unsigned, unsigned> p = inst_order_[inst_ptr_];
+        out_ = (*data_)[p.first][p.second];
+        ++inst_ptr_;
+        return true;
+      } else {
+        // current chunk is used up: hand it back and fetch the next one
+        if (data_ != NULL) iter_.Recycle(&data_);
+        if (!iter_.Next(&data_)) return false;
+        // rebuild the (vector index, offset) visiting order for the new chunk
+        inst_order_.clear();
+        for (unsigned i = 0; i < data_->size(); ++i) {
+          const InstVector &tmp = (*data_)[i];
+          for (unsigned j = 0; j < tmp.Size(); ++j) {
+            inst_order_.push_back(std::make_pair(i, j));
+          }
+        }
+        // shuffle instance order if needed
+        if (param_.shuffle) {
+            std::shuffle(inst_order_.begin(), inst_order_.end(), common::RANDOM_ENGINE(kRandMagic + param_.seed));
+        }
+        inst_ptr_ = 0;
+      }
+    }
+    return false;
+  }
+  virtual const DataInst &Value(void) const {
+    return out_;
+  }
+
+ private:
+  // random magic
+  static const int kRandMagic = 111;
+  // output instance
+  DataInst out_;
+  // position of the next instance inside inst_order_
+  size_t inst_ptr_;
+  // internal instance order
+  std::vector<std::pair<unsigned, unsigned> > inst_order_;
+  // chunk currently being consumed, NULL before the first fetch
+  std::vector<InstVector> *data_;
+  // internal parser
+  ImageRecordIOParser parser_;
+  // backend thread
+  dmlc::ThreadedIter<std::vector<InstVector> > iter_;
+  // iterator parameters (shuffle flag and random seed)
+  ImageRecordParam param_;
+};
+DMLC_REGISTER_PARAMETER(ImageRecParserParam);
+DMLC_REGISTER_PARAMETER(ImageRecordParam);
+// register ImageRecordIter wrapped in BatchAdaptIter, so that the
+// registered iterator produces DataBatch rather than single DataInst
+MXNET_REGISTER_IO_CHAINED_ITER(ImageRecordIter, ImageRecordIter, BatchAdaptIter)
+    .describe("Create iterator for dataset packed in recordio.")
+    .add_arguments(ImageRecordParam::__FIELDS__())
+    .add_arguments(ImageRecParserParam::__FIELDS__())
+    .add_arguments(BatchParam::__FIELDS__())
+    .add_arguments(ImageAugmenterParam::__FIELDS__());
+}  // namespace io
+}  // namespace mxnet
diff --git a/src/utils/decoder.h b/src/utils/decoder.h
new file mode 100644
index 000000000000..17203392cc60
--- /dev/null
+++ b/src/utils/decoder.h
@@ -0,0 +1,128 @@
+#ifndef MXNET_UTILS_DECODER_H_
+#define MXNET_UTILS_DECODER_H_
+
+#include <vector>
+#if MXNET_USE_OPENCV_DECODER == 0
+  #include <jpeglib.h>
+  #include <setjmp.h>
+  #include <jerror.h>
+#endif
+#include <dmlc/logging.h>
+#include <mshadow/tensor.h>
+#if MXNET_USE_OPENCV
+  #include <opencv2/opencv.hpp>
+#endif
+
+namespace cxxnet {
+namespace utils {
+
+#if MXNET_USE_OPENCV_DECODER == 0
+/*! \brief libjpeg based decoder that decodes a JPEG held in a memory buffer */
+struct JpegDecoder {
+  JpegDecoder(void) {
+    cinfo.err = jpeg_std_error(&jerr.base);
+    jerr.base.error_exit = jerror_exit;
+    jerr.base.output_message = joutput_message;
+    jpeg_create_decompress(&cinfo);
+  }
+  // destructor
+  ~JpegDecoder(void) {
+    jpeg_destroy_decompress(&cinfo);
+  }
+  /*! \brief decode sz bytes at ptr into p_data, laid out as (height, width, channel) */
+  inline void Decode(unsigned char *ptr, size_t sz,
+                     mshadow::TensorContainer<mshadow::cpu, 3, unsigned char> *p_data) {
+    if(setjmp(jerr.jmp)) {
+      jpeg_abort_decompress(&cinfo);
+      LOG(FATAL) << "Libjpeg fail to decode";
+    }
+    this->jpeg_mem_src(&cinfo, ptr, sz);
+    CHECK(jpeg_read_header(&cinfo, TRUE) == JPEG_HEADER_OK) << "libjpeg: failed to decode";
+    CHECK(jpeg_start_decompress(&cinfo) == true) << "libjpeg: failed to decode";
+    p_data->Resize(mshadow::Shape3(cinfo.output_height, cinfo.output_width, cinfo.output_components));
+    JSAMPROW jptr = &((*p_data)[0][0][0]);
+    while (cinfo.output_scanline < cinfo.output_height) {
+      CHECK(jpeg_read_scanlines(&cinfo, &jptr, 1) == true) << "libjpeg: failed to decode";
+      jptr += cinfo.output_width * cinfo.output_components;
+    }
+    CHECK(jpeg_finish_decompress(&cinfo) == true) << "libjpeg: failed to decode";
+  }
+private:
+  struct jerror_mgr {
+    jpeg_error_mgr base;
+    jmp_buf jmp;
+  };
+  // error_exit hook: jump back to the setjmp point in Decode
+  METHODDEF(void) jerror_exit(j_common_ptr jinfo) {
+    jerror_mgr* err = reinterpret_cast<jerror_mgr*>(jinfo->err);
+    longjmp(err->jmp, 1);
+  }
+  // output_message hook: intentionally prints nothing
+  METHODDEF(void) joutput_message(j_common_ptr) {}
+  // the whole image is supplied up front, so a refill request means bad data
+  static boolean mem_fill_input_buffer_ (j_decompress_ptr cinfo) {
+    LOG(FATAL) << "JpegDecoder: bad jpeg image";
+    return TRUE;
+  }
+
+  static void mem_skip_input_data_ (j_decompress_ptr cinfo, long num_bytes_) {
+    // libjpeg's source-manager contract: a zero or negative skip count
+    // must be treated as a no-op rather than as an error
+    if (num_bytes_ <= 0) return;
+    jpeg_source_mgr *src = cinfo->src;
+    const size_t num_bytes = static_cast<size_t>(num_bytes_);
+    // the input is a single buffer, so skipping past its end
+    // means the stream is truncated or corrupt
+    CHECK(src->bytes_in_buffer >= num_bytes) << "fail to decode";
+    src->next_input_byte += num_bytes;
+    src->bytes_in_buffer -= num_bytes;
+  }
+
+  static void mem_term_source_ (j_decompress_ptr cinfo) {}
+  static void mem_init_source_ (j_decompress_ptr cinfo) {}
+  static boolean jpeg_resync_to_restart_(j_decompress_ptr cinfo, int desired) {
+    LOG(FATAL) << "JpegDecoder: bad jpeg image";
+    return TRUE;
+  }
+  void jpeg_mem_src (j_decompress_ptr cinfo, void* buffer, long nbytes) {
+    src.init_source = mem_init_source_;
+    src.fill_input_buffer = mem_fill_input_buffer_;
+    src.skip_input_data = mem_skip_input_data_;
+    src.resync_to_restart = jpeg_resync_to_restart_;
+    src.term_source = mem_term_source_;
+    src.bytes_in_buffer = nbytes;
+    src.next_input_byte = static_cast<JOCTET*>(buffer);
+    cinfo->src = &src;
+  }
+
+private:
+  jpeg_decompress_struct cinfo;
+  jpeg_source_mgr src;
+  jerror_mgr jerr;
+};
+#endif
+
+#if MXNET_USE_OPENCV
+/*! \brief OpenCV based decoder; output is laid out as (height, width, channel) */
+struct OpenCVDecoder {
+  void Decode(unsigned char *ptr, size_t sz, mshadow::TensorContainer<mshadow::cpu, 3, unsigned char> *p_data) {
+    cv::Mat buf(1, sz, CV_8U, ptr);
+    cv::Mat res = cv::imdecode(buf, 1);
+    CHECK(res.data != NULL) << "decoding fail";
+    p_data->Resize(mshadow::Shape3(res.rows, res.cols, 3));
+    for (int y = 0; y < res.rows; ++y) {
+      for (int x = 0; x < res.cols; ++x) {
+        cv::Vec3b bgr = res.at<cv::Vec3b>(y, x);
+        // store in RGB order
+        (*p_data)[y][x][2] = bgr[0];
+        (*p_data)[y][x][1] = bgr[1];
+        (*p_data)[y][x][0] = bgr[2];
+      }
+    }
+  }
+};
+#endif
+} // namespace utils
+} // namespace mxnet
+
+#endif // MXNET_UTILS_DECODER_H_
diff --git a/src/utils/io.h b/src/utils/io.h
new file mode 100644
index 000000000000..3781ce98b012
--- /dev/null
+++ b/src/utils/io.h
@@ -0,0 +1,175 @@
+#ifndef CXXNET_UTILS_IO_H_
+#define CXXNET_UTILS_IO_H_
+/*!
+ * \file io.h
+ * \brief definition of abstract stream interface for IO
+ * \author Bing Xu Tianqi Chen
+ */
+#include "./utils.h"
+#include <dmlc/io.h>
+#include <string>
+#include <algorithm>
+#include <cstring>
+
+namespace cxxnet {
+namespace utils {
+typedef dmlc::Stream IStream;
+typedef dmlc::SeekStream ISeekStream;
+
+/*! \brief a in memory buffer that can be read and write as stream interface */
+struct MemoryBufferStream : public ISeekStream {
+ public:
+  MemoryBufferStream(std::string *p_buffer)
+      : p_buffer_(p_buffer) {
+    curr_ptr_ = 0;
+  }
+  virtual ~MemoryBufferStream(void) {}
+  virtual size_t Read(void *ptr, size_t size) {
+    CHECK(curr_ptr_ <= p_buffer_->length())
+          << " read can not have position excceed buffer length";
+    size_t nread = std::min(p_buffer_->length() - curr_ptr_, size);
+    if (nread != 0) memcpy(ptr, &(*p_buffer_)[0] + curr_ptr_, nread);
+    curr_ptr_ += nread;
+    return nread;
+  }
+  virtual void Write(const void *ptr, size_t size) {
+    if (size == 0) return;
+    if (curr_ptr_ + size > p_buffer_->length()) {
+      p_buffer_->resize(curr_ptr_+size);
+    }
+    memcpy(&(*p_buffer_)[0] + curr_ptr_, ptr, size);
+    curr_ptr_ += size;
+  }
+  virtual void Seek(size_t pos) {
+    curr_ptr_ = static_cast<size_t>(pos);
+  }
+  virtual size_t Tell(void) {
+    return curr_ptr_;
+  }
+
+ private:
+  /*! \brief in memory buffer */
+  std::string *p_buffer_;
+  /*! \brief current pointer */
+  size_t curr_ptr_;
+}; // class MemoryBufferStream
+
+/*! \brief implementation of file i/o stream */
+class StdFile: public ISeekStream {
+ public:
+  /*! \brief constructor */
+  StdFile(const char *fname, const char *mode) {
+    Open(fname, mode);
+  }
+  StdFile() {}
+  virtual ~StdFile(void) {
+    this->Close();
+  }
+  virtual void Open(const char *fname, const char *mode) {
+    fp_ = utils::FopenCheck(fname, mode);
+    fseek(fp_, 0L, SEEK_END);
+    sz_ = ftell(fp_);
+    fseek(fp_, 0L, SEEK_SET);
+  }
+  virtual size_t Read(void *ptr, size_t size) {
+    return fread(ptr, size, 1, fp_);
+  }
+  virtual void Write(const void *ptr, size_t size) {
+    fwrite(ptr, size, 1, fp_);
+  }
+  virtual void Seek(size_t pos) {
+    fseek(fp_, pos, SEEK_SET);
+  }
+  virtual size_t Tell(void) {
+    return static_cast<size_t>(ftell(fp_));
+  }
+  inline void Close(void) {
+    if (fp_ != NULL){
+      fclose(fp_); fp_ = NULL;
+    }
+  }
+  inline size_t Size() {
+    return sz_;
+  }
+ private:
+  FILE *fp_;
+  size_t sz_;
+}; // class StdFile
+
+/*! \brief Basic page class */
+class BinaryPage {
+ public:
+  /*! \brief page size 64 MB */
+  static const size_t kPageSize = 64 << 18;
+ public:
+  /*! \brief memory data object */
+  struct Obj{
+    /*! \brief pointer to the data*/
+    void  *dptr;
+    /*! \brief size */
+    size_t sz;
+    Obj(void * dptr, size_t sz) : dptr(dptr), sz(sz){}
+  };
+ public:
+  /*! \brief constructor of page */
+  BinaryPage(void)  {
+    data_ = new int[kPageSize];
+    utils::Check(data_ != NULL, "fail to allocate page, out of space");
+    this->Clear();
+  };
+  ~BinaryPage() {
+    if (data_) delete [] data_;
+  }
+  /*!
+   * \brief load one page form instream
+   * \return true if loading is successful
+   */
+  inline bool Load(utils::IStream &fi) {
+    return fi.Read(&data_[0], sizeof(int)*kPageSize) !=0;
+  }
+  /*! \brief save one page into outstream */
+  inline void Save(utils::IStream &fo) {
+    fo.Write(&data_[0], sizeof(int)*kPageSize);
+  }
+  /*! \return number of elements */
+  inline int Size(void){
+    return data_[0];
+  }
+  /*! \brief Push one binary object into page
+   *  \param fname file name of obj need to be pushed into
+   *  \return false or true to push into
+   */
+  inline bool Push(const Obj &dat) {
+    if(this->FreeBytes() < dat.sz + sizeof(int)) return false;
+    data_[ Size() + 2 ] = data_[ Size() + 1 ] + dat.sz;
+    memcpy(this->offset(data_[ Size() + 2 ]), dat.dptr, dat.sz);
+    ++ data_[0];
+    return true;
+  }
+  /*! \brief Clear the page */
+  inline void Clear(void) {
+    memset(&data_[0], 0, sizeof(int) * kPageSize);
+  }
+  /*!
+   * \brief Get one binary object from page
+   *  \param r r th obj in the page
+   */
+  inline Obj operator[](int r) {
+    CHECK(r < Size());
+    return Obj(this->offset(data_[ r + 2 ]),  data_[ r + 2 ] - data_[ r + 1 ]);
+  }
+ private:
+  /*! \return number of elements */
+  inline size_t FreeBytes(void) {
+    return (kPageSize - (Size() + 2)) * sizeof(int) - data_[ Size() + 1 ];
+  }
+  inline void* offset(int pos) {
+    return (char*)(&data_[0]) + (kPageSize*sizeof(int) - pos);
+  }
+ private:
+  //int data_[ kPageSize ];
+  int *data_;
+};  // class BinaryPage
+}  // namespace utils
+}  // namespace cxxnet
+#endif
diff --git a/src/utils/thread_buffer.h b/src/utils/thread_buffer.h
new file mode 100644
index 000000000000..7df1ae17aa56
--- /dev/null
+++ b/src/utils/thread_buffer.h
@@ -0,0 +1,205 @@
+#ifndef CXXNET_UTILS_THREAD_BUFFER_H_
+#define CXXNET_UTILS_THREAD_BUFFER_H_
+/*!
+ * \file thread_buffer.h
+ * \brief  multi-thread buffer, iterator, can be used to create parallel pipeline
+ * \author Tianqi Chen
+ */
+#include <vector>
+#include <cstring>
+#include <cstdlib>
+#include "./utils.h"
+#include "./thread.h"
+namespace cxxnet {
+namespace utils {
+/*!
+ * \brief buffered loading iterator that uses multithread
+ * this template method will assume the following paramters
+ * \tparam Elem elememt type to be buffered
+ * \tparam ElemFactory factory type to implement in order to use thread buffer
+ */
+template<typename Elem, typename ElemFactory>
+class ThreadBuffer {
+ public:
+  /*!\brief constructor */
+  ThreadBuffer(void) {
+    this->init_end = false;
+    this->buf_size = 30;
+  }
+  ~ThreadBuffer(void) {
+    if(init_end) this->Destroy();
+  }
+  /*!\brief set parameter, will also pass the parameter to factory */
+  inline void SetParam(const char *name, const char *val) {
+    if (!strcmp( name, "buffer_size")) buf_size = atoi(val);
+    factory.SetParam(name, val);
+  }
+  /*!
+   * \brief initalize the buffered iterator
+   * \param param a initialize parameter that will pass to factory, ignore it if not necessary
+   * \return false if the initlization can't be done, e.g. buffer file hasn't been created 
+   */
+  inline bool Init(void) {
+    if (!factory.Init()) return false;
+    bufA.reserve(buf_size);
+    bufB.reserve(buf_size);
+    for (int i = 0; i < buf_size; ++i) {
+      bufA.push_back(factory.Create());
+      bufB.push_back(factory.Create());
+    }
+    this->init_end = true;
+    this->StartLoader();
+    return true;
+  }  
+  /*!\brief place the iterator before first value */
+  inline void BeforeFirst(void) {
+    // wait till last loader end
+    loading_end.Wait();
+    // critcal zone
+    current_buf = 1;
+    factory.BeforeFirst();
+    // reset terminate limit
+    endA = endB = buf_size;
+    // wake up loader for first part
+    loading_need.Post();
+    // wait til first part is loaded
+    loading_end.Wait();
+    // set current buf to right value
+    current_buf = 0;
+    // wake loader for next part
+    data_loaded = false;
+    loading_need.Post();
+    // set buffer value
+    buf_index = 0;
+  }  
+  /*! \brief destroy the buffer iterator, will deallocate the buffer */
+  inline void Destroy(void) {
+    // wait until the signal is consumed
+    this->destroy_signal = true;
+    loading_need.Post();
+    loader_thread.Join();
+    loading_need.Destroy();
+    loading_end.Destroy();    
+    for (size_t i = 0; i < bufA.size(); ++i) {
+      factory.FreeSpace(bufA[i]);
+    }
+    for (size_t i = 0; i < bufB.size(); ++i) {
+      factory.FreeSpace(bufB[i]);
+    }
+    bufA.clear(); bufB.clear();
+    factory.Destroy();
+    this->init_end = false;
+  }  
+  /*!
+   * \brief get the next element needed in buffer
+   * \param elem element to store into
+   * \return whether reaches end of data
+   */
+  inline bool Next(Elem &elem) {
+    // end of buffer try to switch
+    if (buf_index == buf_size) {
+      this->SwitchBuffer();
+      buf_index = 0;
+    }
+    if (buf_index >= (current_buf ? endA : endB)) { 
+      return false;
+    }
+    std::vector<Elem> &buf = current_buf ? bufA : bufB;
+    elem = buf[buf_index];
+    ++buf_index;
+    return true;
+  }      
+  /*!
+   * \brief get the factory object
+   */
+  inline ElemFactory &get_factory(void) {
+    return factory;
+  }
+  inline const ElemFactory &get_factory(void) const{
+    return factory;
+  }
+  // size of buffer
+  int  buf_size;
+ private:
+  // factory object used to load configures
+  ElemFactory factory;
+  // index in current buffer
+  int buf_index;
+  // indicate which one is current buffer
+  int current_buf;
+  // max limit of visit, also marks termination
+  int endA, endB;
+  // double buffer, one is accessed by loader
+  // the other is accessed by consumer
+  // buffer of the data
+  std::vector<Elem> bufA, bufB;
+  // initialization end
+  bool init_end;
+  // singal whether the data is loaded
+  bool data_loaded;
+  // signal to kill the thread
+  bool destroy_signal;
+  // thread object
+  Thread loader_thread;
+  // signal of the buffer
+  Semaphore loading_end, loading_need;
+  /*!
+   * \brief slave thread
+   * this implementation is like producer-consumer style
+   */
+  inline void RunLoader(void) {
+    while(!destroy_signal) {
+      // sleep until loading is needed
+      loading_need.Wait();      
+      std::vector<Elem> &buf = current_buf ? bufB : bufA;
+      int i;
+      for (i = 0; i < buf_size ; ++i) {
+        if (!factory.LoadNext(buf[i])) {
+          int &end = current_buf ? endB : endA;
+          end = i; // marks the termination
+          break;
+        }
+      }
+      // signal that loading is done
+      data_loaded = true;
+      loading_end.Post();
+    }
+  }
+  /*!\brief entry point of loader thread */
+  inline static CXXNET_THREAD_PREFIX LoaderEntry(void *pthread) {
+    static_cast< ThreadBuffer<Elem,ElemFactory>* >(pthread)->RunLoader();
+    ThreadExit(NULL);
+    return NULL;
+  }
+  /*!\brief start loader thread */
+  inline void StartLoader(void) {
+    destroy_signal = false;
+    // set param
+    current_buf = 1;    
+    loading_need.Init(1);
+    loading_end .Init(0);
+    // reset terminate limit
+    endA = endB = buf_size;
+    loader_thread.Start(LoaderEntry, this);
+    // wait until first part of data is loaded
+    loading_end.Wait();
+    // set current buf to right value
+    current_buf = 0;
+    // wake loader for next part
+    data_loaded = false;
+    loading_need.Post();    
+    buf_index = 0; 
+  }
+  /*!\brief switch double buffer */
+  inline void SwitchBuffer(void) {
+    loading_end.Wait();
+    // loader shall be sleep now, critcal zone!
+    current_buf = !current_buf;
+    // wake up loader
+    data_loaded = false;
+    loading_need.Post();
+  }
+};
+}  // namespace utils
+}  // namespace cxxnet
+#endif

From 38931a69c8b47477de35c549a992b99d79eab866 Mon Sep 17 00:00:00 2001
From: sneakerkg <xiaotj1990327@gmail.com>
Date: Fri, 4 Sep 2015 01:15:31 +0800
Subject: [PATCH 03/15] pass compilation, not tested

---
 Makefile                      |  23 +++-
 include/mxnet/io.h            |   2 +-
 make/config.mk                |   4 +-
 src/common/utils.h            |   5 +-
 src/io/image_augmenter.h      |  21 ++--
 src/io/inst_vector.h          |  11 +-
 src/io/io.cc                  |  14 ++-
 src/io/iter_batch.h           |  36 +++---
 src/io/iter_image_recordio.cc |  38 ++++---
 src/utils/decoder.h           |   2 +-
 src/utils/io.h                | 175 -----------------------------
 src/utils/thread_buffer.h     | 205 ----------------------------------
 tests/python/test_io.py       |  15 +++
 13 files changed, 106 insertions(+), 445 deletions(-)
 delete mode 100644 src/utils/io.h
 delete mode 100644 src/utils/thread_buffer.h

diff --git a/Makefile b/Makefile
index bdebed0b5ae6..2d980ad74785 100644
--- a/Makefile
+++ b/Makefile
@@ -16,10 +16,15 @@ ifndef RABIT
 	RABIT = rabit
 endif
 
+ifneq ($(USE_OPENMP_ITER), 1)
+	export NO_OPENMP = 1
+endif
+
 # use customized config file
 include $(config)
 include mshadow/make/mshadow.mk
 include $(DMLC_CORE)/make/dmlc.mk
+unexport NO_OPENMP
 
 # all tge possible warning tread
 WARNFLAGS= -Wall
@@ -42,10 +47,21 @@ endif
 
 # setup opencv
 ifeq ($(USE_OPENCV),1)
-	CFLAGS+= -DCXXNET_USE_OPENCV=1
+	CFLAGS+= -DMXNET_USE_OPENCV=1
 	LDFLAGS+= `pkg-config --libs opencv`
 else
-	CFLAGS+= -DCXXNET_USE_OPENCV=0
+	CFLAGS+= -DMXNET_USE_OPENCV=0
+endif
+
+# setup opencv decoder
+ifeq ($(USE_OPENCV_DECODER),1)
+	CFLAGS+= -DMXNET_USE_OPENCV_DECODER=1
+else
+	CFLAGS+= -DMXNET_USE_OPENCV_DECODER=0
+endif
+
+ifeq ($(USE_OPENMP_ITER), 1)
+	CFLAGS += -fopenmp
 endif
 
 ifeq ($(USE_CUDNN), 1)
@@ -64,7 +80,7 @@ endif
 #BIN = test/test_threaded_engine test/api_registry_test
 OBJ = narray_function_cpu.o
 # add threaded engine after it is done
-OBJCXX11 = reshape_cpu.o engine.o narray.o c_api.o operator.o symbol.o storage.o fully_connected_cpu.o static_graph.o activation_cpu.o graph_executor.o softmax_cpu.o elementwise_sum_cpu.o pooling_cpu.o convolution_cpu.o io.o iter_mnist.o
+OBJCXX11 = reshape_cpu.o engine.o narray.o c_api.o operator.o symbol.o storage.o fully_connected_cpu.o static_graph.o activation_cpu.o graph_executor.o softmax_cpu.o elementwise_sum_cpu.o pooling_cpu.o convolution_cpu.o io.o iter_mnist.o iter_image_recordio.o
 CUOBJ =
 SLIB = lib/libmxnet.so
 ALIB = lib/libmxnet.a
@@ -107,6 +123,7 @@ reshape_cpu.o: src/operator/reshape.cc
 reshape_gpu.o: src/operator/reshape.cu
 io.o: src/io/io.cc
 iter_mnist.o: src/io/iter_mnist.cc
+iter_image_recordio.o: src/io/iter_image_recordio.cc
 
 lib/libmxnet.a: $(OBJ) $(OBJCXX11) $(CUOBJ) $(LIB_DEP)
 lib/libmxnet.so: $(OBJ) $(OBJCXX11) $(CUOBJ) $(LIB_DEP)
diff --git a/include/mxnet/io.h b/include/mxnet/io.h
index 5a8267befc1c..7bb86f4eece3 100644
--- a/include/mxnet/io.h
+++ b/include/mxnet/io.h
@@ -121,7 +121,7 @@ struct DataIteratorReg
  */
 #define MXNET_REGISTER_IO_CHAINED_ITER(name, ChainedDataIterType, HoldingDataIterType)          \
   static ::mxnet::IIterator<DataBatch>* __create__ ## ChainedDataIteratorType ## __() { \
-    return new HoldingDataIteratorType(new ChainedDataIterType);                                    \
+    return new HoldingDataIterType(new ChainedDataIterType);                                    \
   }                                                                     \
   DMLC_REGISTRY_REGISTER(::mxnet::DataIteratorReg, DataIteratorReg, name) \
   .set_body(__create__ ## ChainedDataIteratorType ## __)
diff --git a/make/config.mk b/make/config.mk
index cd04b146180c..3e93e240e493 100644
--- a/make/config.mk
+++ b/make/config.mk
@@ -27,8 +27,8 @@ USE_CUDA_PATH = NONE
 # whether use opencv during compilation
 # you can disable it, however, you will not able to use
 # imbin iterator
-USE_OPENCV = 0
-USE_OPENCV_DECODER = 0
+USE_OPENCV = 1
+USE_OPENCV_DECODER = 1
 # whether use CUDNN R3 library
 USE_CUDNN = 0
 # add the path to CUDNN libary to link and compile flag
diff --git a/src/common/utils.h b/src/common/utils.h
index f7a2dcce0470..b5edb78bd6f9 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -21,12 +21,11 @@ namespace common {
  * \brief Random Engine
  */
 typedef std::mt19937 RANDOM_ENGINE;
-
 // Get a double float, prnd is the pointer to a Random Engine
 #define NextDouble(prnd) std::generate_canonical<float, 10>(*prnd) 
+// Get a random int in [0, range)
+#define NextUInt32(range, prnd) static_cast<uint32_t>(floor(std::generate_canonical<float, 10>(*prnd) * range))
 
-#define NextUInt32(range, prnd) static_cast<uint32_t>(\
-        floor(std::generate_canonical<float, 10>(*prnd) * range))
 /*!
  * \brief Helper functions.
  */
diff --git a/src/io/image_augmenter.h b/src/io/image_augmenter.h
index d33464c4a889..3ca373d768b0 100644
--- a/src/io/image_augmenter.h
+++ b/src/io/image_augmenter.h
@@ -19,7 +19,6 @@ struct ImageAugmentParam : public dmlc::Parameter<ImageAugmentParam> {
   int crop_y_start_;
   /*! \brief whether we do nonrandom croping */
   int crop_x_start_;
-  /*! \brief Indicate the max ratation angle for augmentation, we will random rotate */
   /*! \brief [-max_rotate_angle, max_rotate_angle] */
   int max_rotate_angle_;
   /*! \brief max aspect ratio */
@@ -77,6 +76,7 @@ struct ImageAugmentParam : public dmlc::Parameter<ImageAugmentParam> {
         .describe("Rotate angle");
     DMLC_DECLARE_FIELD(fill_value_).set_default(255)
         .describe("Filled value while padding");
+  }
 };
 
 /*! \brief helper class to do image augmentation */
@@ -99,7 +99,7 @@ class ImageAugmenter {
                        << "input_shape must be three consecutive integers without space example: 1,1,200 ";
         }
         if (!strcmp(kwargs_left[i].first.c_str(), "rotate_list")) {
-          char* val = kwargs_left[i].second.c_str();
+          const char* val = kwargs_left[i].second.c_str();
           const char *end = val + strlen(val);
           char buf[128];
           while (val < end) {
@@ -121,9 +121,9 @@ class ImageAugmenter {
   virtual cv::Mat Process(const cv::Mat &src,
                           common::RANDOM_ENGINE *prnd) {
     // shear
-    float s = common::NextDouble(prnd) * param_.max_shear_ratio_ * 2 - param_.max_shear_ratio_;
+    float s = NextDouble(prnd) * param_.max_shear_ratio_ * 2 - param_.max_shear_ratio_;
     // rotate
-    int angle = common::NextUInt32(param_.max_rotate_angle_ * 2, prnd) - param_.max_rotate_angle_;
+    int angle = NextUInt32(param_.max_rotate_angle_ * 2, prnd) - param_.max_rotate_angle_;
     if (param_.rotate_ > 0) angle = param_.rotate_;
     if (rotate_list_.size() > 0) {
       angle = rotate_list_[NextUInt32(rotate_list_.size() - 1, prnd)];
@@ -160,7 +160,7 @@ class ImageAugmenter {
       mshadow::index_t rand_crop_size = NextUInt32(param_.max_crop_size_- param_.min_crop_size_+1, prnd)+ param_.min_crop_size_;
       mshadow::index_t y = res.rows - rand_crop_size;
       mshadow::index_t x = res.cols - rand_crop_size;
-      if (rand_crop_ != 0) {
+      if (param_.rand_crop_ != 0) {
         y = NextUInt32(y + 1, prnd);
         x = NextUInt32(x + 1, prnd);
       }
@@ -171,8 +171,8 @@ class ImageAugmenter {
       cv::resize(res(roi), res, cv::Size(shape_[1], shape_[2]));
     }
     else{
-      utils::Check(static_cast<mshadow::index_t>(res.cols) >= shape_[1] && static_cast<mshadow::index_t>(res.rows) >= shape_[2],
-        "input image size smaller than input shape");
+      CHECK(static_cast<mshadow::index_t>(res.cols) >= shape_[1] && static_cast<mshadow::index_t>(res.rows) >= shape_[2]) 
+          << "input image size smaller than input shape";
       mshadow::index_t y = res.rows - shape_[2];
       mshadow::index_t x = res.cols - shape_[1];
       if (param_.rand_crop_ != 0) {
@@ -240,9 +240,9 @@ class ImageAugmenter {
  private:
   // whether skip processing
   inline bool NeedProcess(void) const {
-    if (max_rotate_angle_ > 0 || max_shear_ratio_ > 0.0f
-        || rotate_ > 0 || rotate_list_.size() > 0) return true;
-    if (min_crop_size_ > 0 && max_crop_size_ > 0) return true;
+    if (param_.max_rotate_angle_ > 0 || param_.max_shear_ratio_ > 0.0f
+        || param_.rotate_ > 0 || rotate_list_.size() > 0) return true;
+    if (param_.min_crop_size_ > 0 && param_.max_crop_size_ > 0) return true;
     return false;
   }
   // temp input space
@@ -252,6 +252,7 @@ class ImageAugmenter {
   // rotation param
   cv::Mat rotateM;
   // parameters
+  ImageAugmentParam param_;
   /*! \brief input shape */
   mshadow::Shape<4> shape_;
   /*! \brief list of possible rotate angle */
diff --git a/src/io/inst_vector.h b/src/io/inst_vector.h
index 9490ceab94c1..4ced7dd64c63 100644
--- a/src/io/inst_vector.h
+++ b/src/io/inst_vector.h
@@ -7,10 +7,11 @@
 #ifndef MXNET_INST_VECTOR_H_
 #define MXNET_INST_VECTOR_H_
 
-#include "./data.h"
-#include <vector>
+#include <mxnet/io.h>
+#include <mxnet/base.h>
 #include <dmlc/base.h>
 #include <mshadow/tensor.h>
+#include <vector>
 
 namespace mxnet {
 namespace io {
@@ -30,7 +31,7 @@ class TensorVector {
     CHECK(i + 1 < offset_.size());
     CHECK(shape_[i].Size() == offset_[i + 1] - offset_[i]);
     return mshadow::Tensor<cpu, dim, DType>
-        ((DType*)BeginPtr(content_) + offset_[i], shape_[i]);
+        ((DType*)dmlc::BeginPtr(content_) + offset_[i], shape_[i]);
   }
   inline mshadow::Tensor<cpu, dim, DType> Back() const {
     return (*this)[Size() - 1];
@@ -73,8 +74,8 @@ class InstVector {
   inline DataInst operator[](size_t i) const {
     DataInst inst;
     inst.index = index_[i];
-    inst.data = data_[i];
-    inst.label = label_[i];
+    inst.data.push_back(TBlob(data_[i]));
+    inst.data.push_back(TBlob(label_[i]));
     return inst;
   }
   // get back of instance vector
diff --git a/src/io/io.cc b/src/io/io.cc
index 9095f4089c92..b2dbc9f8c2c5 100644
--- a/src/io/io.cc
+++ b/src/io/io.cc
@@ -4,14 +4,18 @@
 
 #include <mxnet/io.h>
 #include <dmlc/registry.h>
-#include <image_augmenter.h>
-#include <>
-#include <iter_batch.h>
+#include "./image_augmenter.h"
+#include "./iter_batch.h"
 
 // Registers
 namespace dmlc {
 DMLC_REGISTRY_ENABLE(::mxnet::DataIteratorReg);
+}  // namespace dmlc
+
+namespace mxnet {
+namespace io {
 // Register parameters in header files
 DMLC_REGISTER_PARAMETER(BatchParam);
-DMLC_REGISTER_PARAMETER(ImageAugmenterParam);
-}  // namespace dmlc
\ No newline at end of file
+DMLC_REGISTER_PARAMETER(ImageAugmentParam);
+}  // namespace mxnet
+}  // namespace io
diff --git a/src/io/iter_batch.h b/src/io/iter_batch.h
index a0e4ab7e7ba5..f258bc2d6afd 100644
--- a/src/io/iter_batch.h
+++ b/src/io/iter_batch.h
@@ -36,7 +36,7 @@ struct BatchParam : public dmlc::Parameter<BatchParam> {
     DMLC_DECLARE_FIELD(test_skipread_).set_default(false)
         .describe("Skip read for testing.");
     DMLC_DECLARE_FIELD(silent_).set_default(false)
-        .describe("Whether to print batch information.")
+        .describe("Whether to print batch information.");
   }
 };
     
@@ -48,7 +48,7 @@ class BatchAdaptIter: public IIterator<DataBatch> {
   }
   virtual ~BatchAdaptIter(void) {
     delete base_;
-    out_.FreeSpaceDense();
+    FreeSpaceDense();
   }
   virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
     std::vector<std::pair<std::string, std::string> > kwargs_left;
@@ -57,7 +57,7 @@ class BatchAdaptIter: public IIterator<DataBatch> {
     for (size_t i = 0; i < kwargs_left.size(); i++) {
       if (!strcmp(kwargs_left[i].first.c_str(), "input_shape")) {
         CHECK(sscanf(kwargs_left[i].second.c_str(), "%u,%u,%u", &shape_[1], &shape_[2], &shape_[3]) == 3)
-          << "input_shape must be three consecutive integers without space example: 1,1,200 ")
+          << "input_shape must be three consecutive integers without space example: 1,1,200 ";
       }
     }
     // init base iterator
@@ -88,13 +88,13 @@ class BatchAdaptIter: public IIterator<DataBatch> {
 
     while (base_->Next()) {
       const DataInst& d = base_->Value();
-      mshadow::Copy(label[top], d.data[1].get<mshadow::cpu, 2, float>());
+      mshadow::Copy(label[top], d.data[1].get<mshadow::cpu, 1, float>());
       out_.inst_index[top] = d.index;
-      mshadow::Copy(data[top], d.data[0].get<mshadow::cpu, 4, float>());
+      mshadow::Copy(data[top], d.data[0].get<mshadow::cpu, 3, float>());
 
       if (++ top >= param_.batch_size_) {
-        out.data[0] = TBlob(data);
-        out.data[1] = TBlob(label);
+        out_.data[0] = TBlob(data);
+        out_.data[1] = TBlob(label);
         return true;
       }
     }
@@ -105,16 +105,16 @@ class BatchAdaptIter: public IIterator<DataBatch> {
         for (; top < param_.batch_size_; ++top, ++num_overflow_) {
           CHECK(base_->Next()) << "number of input must be bigger than batch size";
           const DataInst& d = base_->Value();
-          mshadow::Copy(label[top], d.data[1].get<mshadow::cpu, 2, float>());
+          mshadow::Copy(label[top], d.data[1].get<mshadow::cpu, 1, float>());
           out_.inst_index[top] = d.index;
-          mshadow::Copy(data[top], d.data[0].get<mshadow::cpu, 4, float>());
+          mshadow::Copy(data[top], d.data[0].get<mshadow::cpu, 3, float>());
         }
         out_.num_batch_padd = num_overflow_;
       } else {
-        out_.num_batch_padd = batch_size_ - top;
+        out_.num_batch_padd = param_.batch_size_ - top;
       }
-      out.data[0] = TBlob(data);
-      out.data[1] = TBlob(label);
+      out_.data[0] = TBlob(data);
+      out_.data[1] = TBlob(label);
       return true;
     }
     return false;
@@ -124,6 +124,8 @@ class BatchAdaptIter: public IIterator<DataBatch> {
     return out_;
   }
 private:
+  /*! \brief batch parameters */
+  BatchParam param_;
   /*! \brief base iterator */
   IIterator<DataInst> *base_;
   /*! \brief input shape */
@@ -141,16 +143,16 @@ class BatchAdaptIter: public IIterator<DataBatch> {
   // Functions that allocate and free tensor space
   inline void AllocSpaceDense(bool pad = false) { 
     data = mshadow::NewTensor<mshadow::cpu>(shape_, 0.0f, pad);
-    mshadow::Shape<2> lshape = mshadow::Shape2(batch_size, label_width);
+    mshadow::Shape<2> lshape = mshadow::Shape2(param_.batch_size_, param_.label_width_);
     label = mshadow::NewTensor<mshadow::cpu>(lshape, 0.0f, pad);
-    out_.inst_index = new unsigned[batch_size];
-    out_.batch_size = batch_size;
+    out_.inst_index = new unsigned[param_.batch_size_];
+    out_.batch_size = param_.batch_size_;
     out_.data.resize(2);
   }
   /*! \brief auxiliary function to free space, if needed, dense only */
   inline void FreeSpaceDense(void) {
     if (label.dptr_ != NULL) {
-      delete [] inst_index;
+      delete [] out_.inst_index;
       mshadow::FreeSpace(&label);
       mshadow::FreeSpace(&data);
       label.dptr_ = NULL;
@@ -159,4 +161,4 @@ class BatchAdaptIter: public IIterator<DataBatch> {
 }; // class BatchAdaptIter
 }  // namespace io
 }  // namespace cxxnet
-#endif  // MXNET_IO_ITER_BATCH_H_
\ No newline at end of file
+#endif  // MXNET_IO_ITER_BATCH_H_
diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc
index 2ab1aa8958cb..9977ddd2290c 100644
--- a/src/io/iter_image_recordio.cc
+++ b/src/io/iter_image_recordio.cc
@@ -16,6 +16,7 @@ iterator
 #include "./inst_vector.h"
 #include "./image_recordio.h"
 #include "./image_augmenter.h"
+#include "./iter_batch.h"
 #include "../utils/decoder.h"
 namespace mxnet {
 namespace io {
@@ -57,7 +58,7 @@ class ImageLabelMap {
     // be careful not to resize label_ afterwards
     idx2label_.reserve(image_index_.size());
     for (size_t i = 0; i < image_index_.size(); ++i) {
-      idx2label_[image_index_[i]] = BeginPtr(label_) + i * label_width_;
+      idx2label_[image_index_[i]] = dmlc::BeginPtr(label_) + i * label_width_;
     }
     if (!silent) {
       LOG(INFO) << "Loaded ImageList from " << path_imglist << ' '
@@ -101,7 +102,7 @@ struct ImageRecParserParam : public dmlc::Parameter<ImageRecParserParam> {
   DMLC_DECLARE_PARAMETER(ImageRecParserParam) {
     DMLC_DECLARE_FIELD(path_imglist_).set_default("")
         .describe("Path to image list.");
-    DMLC_DECLARE_FIELD(path_imagrec_).set_default("./data/imgrec.rec")
+    DMLC_DECLARE_FIELD(path_imgrec_).set_default("./data/imgrec.rec")
         .describe("Path to image record file.");
     DMLC_DECLARE_FIELD(nthread_).set_lower_bound(1).set_default(4)
         .describe("Number of thread to do parsing.");
@@ -178,7 +179,7 @@ inline void ImageRecordIOParser::Init(const std::vector<std::pair<std::string, s
   // setup decoders
   for (int i = 0; i < threadget; ++i) {
     augmenters_.push_back(new ImageAugmenter());
-    augmenters_[i].init(kwargs_left);
+    augmenters_[i]->Init(kwargs_left);
     prnds_.push_back(new common::RANDOM_ENGINE((i + 1) * kRandMagic));
   }
   
@@ -186,16 +187,16 @@ inline void ImageRecordIOParser::Init(const std::vector<std::pair<std::string, s
   // TODO, hack
   const char *ps_rank = getenv("PS_RANK");
   if (ps_rank != NULL) {
-    param_.dist_worker_rank = atoi(ps_rank);
+    param_.dist_worker_rank_ = atoi(ps_rank);
   }
 
   if (param_.path_imglist_.length() != 0) {
     label_map_ = new ImageLabelMap(param_.path_imglist_.c_str(),
-                                   param_.label_width_, silent_ != 0);
+                                   param_.label_width_, param_.silent_ != 0);
   } else {
     param_.label_width_ = 1;
   }
-  CHECK(path_imgrec_.length() != 0)
+  CHECK(param_.path_imgrec_.length() != 0)
     << "ImageRecordIOIterator: must specify image_rec";
 #if MSHADOW_DIST_PS
     // TODO move to a better place
@@ -221,8 +222,8 @@ ParseNext(std::vector<InstVector> *out_vec) {
   {
     CHECK(omp_get_num_threads() == param_.nthread_);
     int tid = omp_get_thread_num();
-    dmlc::RecordIOChunkReader reader(chunk, tid, parser_.nthread_);
-    mxnet::ImageRecordIO rec;
+    dmlc::RecordIOChunkReader reader(chunk, tid, param_.nthread_);
+    ImageRecordIO rec;
     dmlc::InputSplit::Blob blob;
     // image data
     InstVector &out = (*out_vec)[tid];
@@ -238,18 +239,21 @@ ParseNext(std::vector<InstVector> *out_vec) {
                mshadow::Shape3(3, res.rows, res.cols),
                mshadow::Shape1(param_.label_width_));
       DataInst inst = out.Back();
+      // turn datainst into tensor
+      mshadow::Tensor<mshadow::cpu, 3> data = inst.data[0].get<mshadow::cpu, 3, float>(); 
+      mshadow::Tensor<mshadow::cpu, 1> label = inst.data[1].get<mshadow::cpu, 1, float>(); 
       for (int i = 0; i < res.rows; ++i) {
         for (int j = 0; j < res.cols; ++j) {
           cv::Vec3b bgr = res.at<cv::Vec3b>(i, j);
-          inst.data[0][i][j] = bgr[2];
-          inst.data[1][i][j] = bgr[1];
-          inst.data[2][i][j] = bgr[0];
+          data[0][i][j] = bgr[2];
+          data[1][i][j] = bgr[1];
+          data[2][i][j] = bgr[0];
         }
       }
       if (label_map_ != NULL) {
-        mshadow::Copy(inst.label, label_map_->Find(rec.image_index()));
+        mshadow::Copy(label, label_map_->Find(rec.image_index()));
       } else {
-        inst.label[0] = rec.header.label;
+        label[0] = rec.header.label;
       }
       res.release();
     }
@@ -324,7 +328,7 @@ class ImageRecordIter : public IIterator<DataInst> {
         }
         // shuffle instance order if needed
         if (shuffle_ != 0) {
-            std::shuffle(inst_order_.begin(), inst_.end(), common::RANDOM_ENGINE(kRandMagic + param_.seed));
+            std::shuffle(inst_order_.begin(), inst_order_.end(), common::RANDOM_ENGINE(kRandMagic + param_.seed));
         }
         inst_ptr_ = 0;
       }
@@ -353,17 +357,15 @@ class ImageRecordIter : public IIterator<DataInst> {
   // backend thread
   dmlc::ThreadedIter<std::vector<InstVector> > iter_;
   // parameters
-  ImageRecParserParam param_;
+  ImageRecordParam param_;
 };
 DMLC_REGISTER_PARAMETER(ImageRecParserParam);
 DMLC_REGISTER_PARAMETER(ImageRecordParam);
-MXNET_REGISTER_IO_ITER(MNISTIter, MNISTIter)
 MXNET_REGISTER_IO_CHAINED_ITER(ImageRecordIter, ImageRecordIter, BatchAdaptIter)
     .describe("Create iterator for dataset packed in recordio.")
     .add_arguments(ImageRecordParam::__FIELDS__())
     .add_arguments(ImageRecParserParam::__FIELDS__())
     .add_arguments(BatchParam::__FIELDS__())
-    .add_arguments(ImageAugmenterParam::__FIELDS__());
+    .add_arguments(ImageAugmentParam::__FIELDS__());
 }  // namespace io
 }  // namespace mxnet
-#endif  // ITER_IMAGE_RECORDIO_INL_HPP_
diff --git a/src/utils/decoder.h b/src/utils/decoder.h
index 17203392cc60..52db01edee23 100644
--- a/src/utils/decoder.h
+++ b/src/utils/decoder.h
@@ -13,7 +13,7 @@
   #include <opencv2/opencv.hpp>
 #endif
 
-namespace cxxnet {
+namespace mxnet {
 namespace utils {
 
 #if MXNET_USE_OPENCV_DECODER == 0
diff --git a/src/utils/io.h b/src/utils/io.h
deleted file mode 100644
index 3781ce98b012..000000000000
--- a/src/utils/io.h
+++ /dev/null
@@ -1,175 +0,0 @@
-#ifndef CXXNET_UTILS_IO_H_
-#define CXXNET_UTILS_IO_H_
-/*!
- * \file io.h
- * \brief definition of abstract stream interface for IO
- * \author Bing Xu Tianqi Chen
- */
-#include "./utils.h"
-#include <dmlc/io.h>
-#include <string>
-#include <algorithm>
-#include <cstring>
-
-namespace cxxnet {
-namespace utils {
-typedef dmlc::Stream IStream;
-typedef dmlc::SeekStream ISeekStream;
-
-/*! \brief a in memory buffer that can be read and write as stream interface */
-struct MemoryBufferStream : public ISeekStream {
- public:
-  MemoryBufferStream(std::string *p_buffer)
-      : p_buffer_(p_buffer) {
-    curr_ptr_ = 0;
-  }
-  virtual ~MemoryBufferStream(void) {}
-  virtual size_t Read(void *ptr, size_t size) {
-    CHECK(curr_ptr_ <= p_buffer_->length())
-          << " read can not have position excceed buffer length";
-    size_t nread = std::min(p_buffer_->length() - curr_ptr_, size);
-    if (nread != 0) memcpy(ptr, &(*p_buffer_)[0] + curr_ptr_, nread);
-    curr_ptr_ += nread;
-    return nread;
-  }
-  virtual void Write(const void *ptr, size_t size) {
-    if (size == 0) return;
-    if (curr_ptr_ + size > p_buffer_->length()) {
-      p_buffer_->resize(curr_ptr_+size);
-    }
-    memcpy(&(*p_buffer_)[0] + curr_ptr_, ptr, size);
-    curr_ptr_ += size;
-  }
-  virtual void Seek(size_t pos) {
-    curr_ptr_ = static_cast<size_t>(pos);
-  }
-  virtual size_t Tell(void) {
-    return curr_ptr_;
-  }
-
- private:
-  /*! \brief in memory buffer */
-  std::string *p_buffer_;
-  /*! \brief current pointer */
-  size_t curr_ptr_;
-}; // class MemoryBufferStream
-
-/*! \brief implementation of file i/o stream */
-class StdFile: public ISeekStream {
- public:
-  /*! \brief constructor */
-  StdFile(const char *fname, const char *mode) {
-    Open(fname, mode);
-  }
-  StdFile() {}
-  virtual ~StdFile(void) {
-    this->Close();
-  }
-  virtual void Open(const char *fname, const char *mode) {
-    fp_ = utils::FopenCheck(fname, mode);
-    fseek(fp_, 0L, SEEK_END);
-    sz_ = ftell(fp_);
-    fseek(fp_, 0L, SEEK_SET);
-  }
-  virtual size_t Read(void *ptr, size_t size) {
-    return fread(ptr, size, 1, fp_);
-  }
-  virtual void Write(const void *ptr, size_t size) {
-    fwrite(ptr, size, 1, fp_);
-  }
-  virtual void Seek(size_t pos) {
-    fseek(fp_, pos, SEEK_SET);
-  }
-  virtual size_t Tell(void) {
-    return static_cast<size_t>(ftell(fp_));
-  }
-  inline void Close(void) {
-    if (fp_ != NULL){
-      fclose(fp_); fp_ = NULL;
-    }
-  }
-  inline size_t Size() {
-    return sz_;
-  }
- private:
-  FILE *fp_;
-  size_t sz_;
-}; // class StdFile
-
-/*! \brief Basic page class */
-class BinaryPage {
- public:
-  /*! \brief page size 64 MB */
-  static const size_t kPageSize = 64 << 18;
- public:
-  /*! \brief memory data object */
-  struct Obj{
-    /*! \brief pointer to the data*/
-    void  *dptr;
-    /*! \brief size */
-    size_t sz;
-    Obj(void * dptr, size_t sz) : dptr(dptr), sz(sz){}
-  };
- public:
-  /*! \brief constructor of page */
-  BinaryPage(void)  {
-    data_ = new int[kPageSize];
-    utils::Check(data_ != NULL, "fail to allocate page, out of space");
-    this->Clear();
-  };
-  ~BinaryPage() {
-    if (data_) delete [] data_;
-  }
-  /*!
-   * \brief load one page form instream
-   * \return true if loading is successful
-   */
-  inline bool Load(utils::IStream &fi) {
-    return fi.Read(&data_[0], sizeof(int)*kPageSize) !=0;
-  }
-  /*! \brief save one page into outstream */
-  inline void Save(utils::IStream &fo) {
-    fo.Write(&data_[0], sizeof(int)*kPageSize);
-  }
-  /*! \return number of elements */
-  inline int Size(void){
-    return data_[0];
-  }
-  /*! \brief Push one binary object into page
-   *  \param fname file name of obj need to be pushed into
-   *  \return false or true to push into
-   */
-  inline bool Push(const Obj &dat) {
-    if(this->FreeBytes() < dat.sz + sizeof(int)) return false;
-    data_[ Size() + 2 ] = data_[ Size() + 1 ] + dat.sz;
-    memcpy(this->offset(data_[ Size() + 2 ]), dat.dptr, dat.sz);
-    ++ data_[0];
-    return true;
-  }
-  /*! \brief Clear the page */
-  inline void Clear(void) {
-    memset(&data_[0], 0, sizeof(int) * kPageSize);
-  }
-  /*!
-   * \brief Get one binary object from page
-   *  \param r r th obj in the page
-   */
-  inline Obj operator[](int r) {
-    CHECK(r < Size());
-    return Obj(this->offset(data_[ r + 2 ]),  data_[ r + 2 ] - data_[ r + 1 ]);
-  }
- private:
-  /*! \return number of elements */
-  inline size_t FreeBytes(void) {
-    return (kPageSize - (Size() + 2)) * sizeof(int) - data_[ Size() + 1 ];
-  }
-  inline void* offset(int pos) {
-    return (char*)(&data_[0]) + (kPageSize*sizeof(int) - pos);
-  }
- private:
-  //int data_[ kPageSize ];
-  int *data_;
-};  // class BinaryPage
-}  // namespace utils
-}  // namespace cxxnet
-#endif
diff --git a/src/utils/thread_buffer.h b/src/utils/thread_buffer.h
deleted file mode 100644
index 7df1ae17aa56..000000000000
--- a/src/utils/thread_buffer.h
+++ /dev/null
@@ -1,205 +0,0 @@
-#ifndef CXXNET_UTILS_THREAD_BUFFER_H_
-#define CXXNET_UTILS_THREAD_BUFFER_H_
-/*!
- * \file thread_buffer.h
- * \brief  multi-thread buffer, iterator, can be used to create parallel pipeline
- * \author Tianqi Chen
- */
-#include <vector>
-#include <cstring>
-#include <cstdlib>
-#include "./utils.h"
-#include "./thread.h"
-namespace cxxnet {
-namespace utils {
-/*!
- * \brief buffered loading iterator that uses multithread
- * this template method will assume the following paramters
- * \tparam Elem elememt type to be buffered
- * \tparam ElemFactory factory type to implement in order to use thread buffer
- */
-template<typename Elem, typename ElemFactory>
-class ThreadBuffer {
- public:
-  /*!\brief constructor */
-  ThreadBuffer(void) {
-    this->init_end = false;
-    this->buf_size = 30;
-  }
-  ~ThreadBuffer(void) {
-    if(init_end) this->Destroy();
-  }
-  /*!\brief set parameter, will also pass the parameter to factory */
-  inline void SetParam(const char *name, const char *val) {
-    if (!strcmp( name, "buffer_size")) buf_size = atoi(val);
-    factory.SetParam(name, val);
-  }
-  /*!
-   * \brief initalize the buffered iterator
-   * \param param a initialize parameter that will pass to factory, ignore it if not necessary
-   * \return false if the initlization can't be done, e.g. buffer file hasn't been created 
-   */
-  inline bool Init(void) {
-    if (!factory.Init()) return false;
-    bufA.reserve(buf_size);
-    bufB.reserve(buf_size);
-    for (int i = 0; i < buf_size; ++i) {
-      bufA.push_back(factory.Create());
-      bufB.push_back(factory.Create());
-    }
-    this->init_end = true;
-    this->StartLoader();
-    return true;
-  }  
-  /*!\brief place the iterator before first value */
-  inline void BeforeFirst(void) {
-    // wait till last loader end
-    loading_end.Wait();
-    // critcal zone
-    current_buf = 1;
-    factory.BeforeFirst();
-    // reset terminate limit
-    endA = endB = buf_size;
-    // wake up loader for first part
-    loading_need.Post();
-    // wait til first part is loaded
-    loading_end.Wait();
-    // set current buf to right value
-    current_buf = 0;
-    // wake loader for next part
-    data_loaded = false;
-    loading_need.Post();
-    // set buffer value
-    buf_index = 0;
-  }  
-  /*! \brief destroy the buffer iterator, will deallocate the buffer */
-  inline void Destroy(void) {
-    // wait until the signal is consumed
-    this->destroy_signal = true;
-    loading_need.Post();
-    loader_thread.Join();
-    loading_need.Destroy();
-    loading_end.Destroy();    
-    for (size_t i = 0; i < bufA.size(); ++i) {
-      factory.FreeSpace(bufA[i]);
-    }
-    for (size_t i = 0; i < bufB.size(); ++i) {
-      factory.FreeSpace(bufB[i]);
-    }
-    bufA.clear(); bufB.clear();
-    factory.Destroy();
-    this->init_end = false;
-  }  
-  /*!
-   * \brief get the next element needed in buffer
-   * \param elem element to store into
-   * \return whether reaches end of data
-   */
-  inline bool Next(Elem &elem) {
-    // end of buffer try to switch
-    if (buf_index == buf_size) {
-      this->SwitchBuffer();
-      buf_index = 0;
-    }
-    if (buf_index >= (current_buf ? endA : endB)) { 
-      return false;
-    }
-    std::vector<Elem> &buf = current_buf ? bufA : bufB;
-    elem = buf[buf_index];
-    ++buf_index;
-    return true;
-  }      
-  /*!
-   * \brief get the factory object
-   */
-  inline ElemFactory &get_factory(void) {
-    return factory;
-  }
-  inline const ElemFactory &get_factory(void) const{
-    return factory;
-  }
-  // size of buffer
-  int  buf_size;
- private:
-  // factory object used to load configures
-  ElemFactory factory;
-  // index in current buffer
-  int buf_index;
-  // indicate which one is current buffer
-  int current_buf;
-  // max limit of visit, also marks termination
-  int endA, endB;
-  // double buffer, one is accessed by loader
-  // the other is accessed by consumer
-  // buffer of the data
-  std::vector<Elem> bufA, bufB;
-  // initialization end
-  bool init_end;
-  // singal whether the data is loaded
-  bool data_loaded;
-  // signal to kill the thread
-  bool destroy_signal;
-  // thread object
-  Thread loader_thread;
-  // signal of the buffer
-  Semaphore loading_end, loading_need;
-  /*!
-   * \brief slave thread
-   * this implementation is like producer-consumer style
-   */
-  inline void RunLoader(void) {
-    while(!destroy_signal) {
-      // sleep until loading is needed
-      loading_need.Wait();      
-      std::vector<Elem> &buf = current_buf ? bufB : bufA;
-      int i;
-      for (i = 0; i < buf_size ; ++i) {
-        if (!factory.LoadNext(buf[i])) {
-          int &end = current_buf ? endB : endA;
-          end = i; // marks the termination
-          break;
-        }
-      }
-      // signal that loading is done
-      data_loaded = true;
-      loading_end.Post();
-    }
-  }
-  /*!\brief entry point of loader thread */
-  inline static CXXNET_THREAD_PREFIX LoaderEntry(void *pthread) {
-    static_cast< ThreadBuffer<Elem,ElemFactory>* >(pthread)->RunLoader();
-    ThreadExit(NULL);
-    return NULL;
-  }
-  /*!\brief start loader thread */
-  inline void StartLoader(void) {
-    destroy_signal = false;
-    // set param
-    current_buf = 1;    
-    loading_need.Init(1);
-    loading_end .Init(0);
-    // reset terminate limit
-    endA = endB = buf_size;
-    loader_thread.Start(LoaderEntry, this);
-    // wait until first part of data is loaded
-    loading_end.Wait();
-    // set current buf to right value
-    current_buf = 0;
-    // wake loader for next part
-    data_loaded = false;
-    loading_need.Post();    
-    buf_index = 0; 
-  }
-  /*!\brief switch double buffer */
-  inline void SwitchBuffer(void) {
-    loading_end.Wait();
-    // loader shall be sleep now, critcal zone!
-    current_buf = !current_buf;
-    // wake up loader
-    data_loaded = false;
-    loading_need.Post();
-  }
-};
-}  // namespace utils
-}  // namespace cxxnet
-#endif
diff --git a/tests/python/test_io.py b/tests/python/test_io.py
index dfeb3f67c293..991a4813033e 100644
--- a/tests/python/test_io.py
+++ b/tests/python/test_io.py
@@ -39,3 +39,18 @@ def test_MNISTIter_reset():
     label_1 = train_dataiter.getlabel().numpy.flatten()
     assert(sum(label_0 - label_1) == 0)
 
+def test_ImageRecIter():
+    dataiter = mx.io.ImageRecordIter(path_imgrec="data/val_cxxnet.rec",
+            image_mean="data/val_cxxnet_mean.bin",
+            rand_crop=True,
+            rand_mirror=True,
+            input_shape="3,224,224",
+            batch_size=128)
+
+
+
+
+
+
+
+

From c1603d77168d62cccd760c974ae8bd251808b08a Mon Sep 17 00:00:00 2001
From: sneakerkg <xiaotj1990327@gmail.com>
Date: Sun, 6 Sep 2015 07:47:04 +0800
Subject: [PATCH 04/15] merge augmenter, modify param attribute

---
 src/io/image_augmenter.h      | 279 +++++++++++++++++++++++++---------
 src/io/iter_batch.h           |  55 ++++---
 src/io/iter_image_recordio.cc |  76 ++++-----
 3 files changed, 275 insertions(+), 135 deletions(-)

diff --git a/src/io/image_augmenter.h b/src/io/image_augmenter.h
index 3ca373d768b0..a81e5297d5b3 100644
--- a/src/io/image_augmenter.h
+++ b/src/io/image_augmenter.h
@@ -1,7 +1,7 @@
 /*!
  * \file image_augmenter_opencv.hpp
  * \brief threaded version of page iterator
- * \author Naiyan Wang, Tianqi Chen
+ * \author Naiyan Wang, Tianqi Chen, Tianjun Xiao
  */
 #ifndef MXNET_IO_IMAGE_AUGMENTER_H_
 #define MXNET_IO_IMAGE_AUGMENTER_H_
@@ -14,68 +14,102 @@ namespace io {
 /*! \brief image augmentation parameters*/
 struct ImageAugmentParam : public dmlc::Parameter<ImageAugmentParam> {
   /*! \brief whether we do random cropping */
-  bool rand_crop_;
+  bool rand_crop;
   /*! \brief whether we do nonrandom croping */
-  int crop_y_start_;
+  int crop_y_start;
   /*! \brief whether we do nonrandom croping */
-  int crop_x_start_;
+  int crop_x_start;
   /*! \brief [-max_rotate_angle, max_rotate_angle] */
-  int max_rotate_angle_;
+  int max_rotate_angle;
   /*! \brief max aspect ratio */
-  float max_aspect_ratio_;
+  float max_aspect_ratio;
   /*! \brief random shear the image [-max_shear_ratio, max_shear_ratio] */
-  float max_shear_ratio_;
+  float max_shear_ratio;
   /*! \brief max crop size */
-  int max_crop_size_;
+  int max_crop_size;
   /*! \brief min crop size */
-  int min_crop_size_;
+  int min_crop_size;
   /*! \brief max scale ratio */
-  float max_random_scale_;
+  float max_random_scale;
   /*! \brief min scale_ratio */
-  float min_random_scale_;
+  float min_random_scale;
   /*! \brief min image size */
-  float min_img_size_;
+  float min_img_size;
   /*! \brief max image size */
-  float max_img_size_;
-  /*! \brief whether to mirror the image */
-  bool mirror_;
+  float max_img_size;
   /*! \brief rotate angle */
-  int rotate_;
+  int rotate;
   /*! \brief filled color while padding */
-  int fill_value_;
+  int fill_value;
+  /*! \brief whether to mirror the image */
+  bool mirror;
+  /*! \brief whether to perform rand mirror the image */
+  bool rand_mirror;
+  /*! \brief mean file string*/
+  std::string mean_img;
+  /*! \brief mean value for r channel */
+  float mean_r;
+  /*! \brief mean value for g channel */
+  float mean_g;
+  /*! \brief mean value for b channel */
+  float mean_b;
+  /*! \brief shape of the image data*/
+  TShape input_shape;
+  /*! \brief maximum ratio of contrast variation */
+  float max_random_contrast_;
+  /*! \brief maximum value of illumination variation */
+  float max_random_illumination_;
   // declare parameters
   // TODO: didn't understand the range for some params
   DMLC_DECLARE_PARAMETER(ImageAugmentParam) {
     DMLC_DECLARE_FIELD(rand_crop_).set_default(true)
         .describe("Whether we de random cropping");
-    DMLC_DECLARE_FIELD(crop_y_start_).set_default(-1)
+    DMLC_DECLARE_FIELD(crop_y_start).set_default(-1)
         .describe("Where to nonrandom crop on y");
-    DMLC_DECLARE_FIELD(crop_x_start_).set_default(-1)
+    DMLC_DECLARE_FIELD(crop_x_start).set_default(-1)
         .describe("Where to nonrandom crop on x");
-    DMLC_DECLARE_FIELD(max_rotate_angle_).set_default(0.0f)
+    DMLC_DECLARE_FIELD(max_rotate_angle).set_default(0.0f)
         .describe("Rotate can be [-max_rotate_angle, max_rotate_angle]");
-    DMLC_DECLARE_FIELD(max_aspect_ratio_).set_default(0.0f)
+    DMLC_DECLARE_FIELD(max_aspect_ratio).set_default(0.0f)
         .describe("Max aspect ratio");
-    DMLC_DECLARE_FIELD(max_shear_ratio_).set_default(0.0f)
+    DMLC_DECLARE_FIELD(max_shear_ratio).set_default(0.0f)
         .describe("Shear rotate can be made between [-max_shear_ratio_, max_shear_ratio_]");
-    DMLC_DECLARE_FIELD(max_crop_size_).set_default(-1)
+    DMLC_DECLARE_FIELD(max_crop_size).set_default(-1)
         .describe("Maximum crop size");
-    DMLC_DECLARE_FIELD(min_crop_size_).set_default(-1)
+    DMLC_DECLARE_FIELD(min_crop_size).set_default(-1)
         .describe("Minimum crop size");
-    DMLC_DECLARE_FIELD(max_random_scale_).set_default(1.0f)
+    DMLC_DECLARE_FIELD(max_random_scale).set_default(1.0f)
         .describe("Maxmum scale ratio");
-    DMLC_DECLARE_FIELD(min_random_scale_).set_default(1.0f)
+    DMLC_DECLARE_FIELD(min_random_scale).set_default(1.0f)
         .describe("Minimum scale ratio");       
-    DMLC_DECLARE_FIELD(max_img_size_).set_default(1e10f)
+    DMLC_DECLARE_FIELD(max_img_size).set_default(1e10f)
         .describe("Maxmum image size");
-    DMLC_DECLARE_FIELD(min_img_size_).set_default(0.0f)
+    DMLC_DECLARE_FIELD(min_img_size).set_default(0.0f)
         .describe("Minimum image size");
-    DMLC_DECLARE_FIELD(mirror_).set_default(false)
-        .describe("Whether to mirror the image");
-    DMLC_DECLARE_FIELD(rotate_).set_default(-1.0f)
+    DMLC_DECLARE_FIELD(rotate).set_default(-1.0f)
         .describe("Rotate angle");
-    DMLC_DECLARE_FIELD(fill_value_).set_default(255)
+    DMLC_DECLARE_FIELD(fill_value).set_default(255)
         .describe("Filled value while padding");
+    DMLC_DECLARE_FIELD(mirror).set_default(false)
+        .describe("Whether to mirror the image");
+    DMLC_DECLARE_FIELD(rand_mirror).set_default(false)
+        .describe("Whether to mirror the image randomly");
+    DMLC_DECLARE_FIELD(mean_img).set_default("")
+        .describe("Mean Image to be subtracted");
+    DMLC_DECLARE_FIELD(mean_r).set_default(0.0f)
+        .describe("Mean value on R channel");
+    DMLC_DECLARE_FIELD(mean_g).set_default(0.0f)
+        .describe("Mean value on G channel");   
+    DMLC_DECLARE_FIELD(mean_b).set_default(0.0f)
+        .describe("Mean value on B channel");
+    float input_shape_default = {3, 224, 224};
+    DMLC_DECLARE_FIELD(input_shape).set_default(TShape(input_shape_default, input_shape_default + 3))
+        .set_expect_ndim(3).enforce_nonzero()
+        .describe("Input shape of the neural net");
+    DMLC_DECLARE_FIELD(max_random_contrast).set_default(0.0f)
+        .describe("Maximum ratio of contrast variation");
+    DMLC_DECLARE_FIELD(max_random_illumination).set_default(0.0f)
+        .describe("Maximum value of illumination variation");
   }
 };
 
@@ -84,8 +118,8 @@ class ImageAugmenter {
  public:
   // contructor
   ImageAugmenter(void)
-      : tmpres(false),
-        rotateM(2, 3, CV_32F) {
+      : tmpres_(false),
+        rotateM_(2, 3, CV_32F) {
   }
   virtual ~ImageAugmenter() {
   }
@@ -94,10 +128,6 @@ class ImageAugmenter {
     std::vector<std::pair<std::string, std::string> > kwargs_left;
     kwargs_left = param_.InitAllowUnknown(kwargs);
     for (size_t i = 0; i < kwargs_left.size(); i++) {
-        if (!strcmp(kwargs_left[i].first.c_str(), "input_shape")) {
-          CHECK(sscanf(kwargs_left[i].second.c_str(), "%u,%u,%u", &shape_[0], &shape_[1], &shape_[2]) == 3)
-                       << "input_shape must be three consecutive integers without space example: 1,1,200 ";
-        }
         if (!strcmp(kwargs_left[i].first.c_str(), "rotate_list")) {
           const char* val = kwargs_left[i].second.c_str();
           const char *end = val + strlen(val);
@@ -109,6 +139,19 @@ class ImageAugmenter {
           }
         }
     }
+    if (param_.mean_img.length() != 0) {
+      dmlc::Stream *fi = dmlc::Stream::Create(param_.mean_img.c_str(), "r", true);
+      if (fi == NULL) {
+        this->CreateMeanImg();
+      } else {
+        if (param_.silent == 0) {
+          printf("loading mean image from %s\n", param_.mean_img.c_str());
+        }
+        meanimg_.LoadBinary(*fi);
+        delete fi;
+        meanfile_ready_ = true;
+      }
+    }
   }
   /*!
    * \brief augment src image, store result into dst
@@ -118,27 +161,27 @@ class ImageAugmenter {
    * \param source of random number
    * \param dst the pointer to the place where we want to store the result
    */
-  virtual cv::Mat Process(const cv::Mat &src,
+  virtual cv::Mat OpencvProcess(const cv::Mat &src,
                           common::RANDOM_ENGINE *prnd) {
     // shear
-    float s = NextDouble(prnd) * param_.max_shear_ratio_ * 2 - param_.max_shear_ratio_;
+    float s = NextDouble(prnd) * param_.max_shear_ratio * 2 - param_.max_shear_ratio;
     // rotate
-    int angle = NextUInt32(param_.max_rotate_angle_ * 2, prnd) - param_.max_rotate_angle_;
-    if (param_.rotate_ > 0) angle = param_.rotate_;
+    int angle = NextUInt32(param_.max_rotate_angle * 2, prnd) - param_.max_rotate_angle;
+    if (param_.rotate > 0) angle = param_.rotate;
     if (rotate_list_.size() > 0) {
       angle = rotate_list_[NextUInt32(rotate_list_.size() - 1, prnd)];
     }
     float a = cos(angle / 180.0 * M_PI);
     float b = sin(angle / 180.0 * M_PI);
     // scale
-    float scale = NextDouble(prnd) * (param_.max_random_scale_ - param_.min_random_scale_) + param_.min_random_scale_;
+    float scale = NextDouble(prnd) * (param_.max_random_scale - param_.min_random_scale) + param_.min_random_scale;
     // aspect ratio
-    float ratio = NextDouble(prnd) * param_.max_aspect_ratio_ * 2 - param_.max_aspect_ratio_ + 1;
+    float ratio = NextDouble(prnd) * param_.max_aspect_ratio * 2 - param_.max_aspect_ratio + 1;
     float hs = 2 * scale / (1 + ratio);
     float ws = ratio * hs;
     // new width and height
-    float new_width = std::max(param_.min_img_size_, std::min(param_.max_img_size_, scale * src.cols));
-    float new_height = std::max(param_.min_img_size_, std::min(param_.max_img_size_, scale * src.rows));
+    float new_width = std::max(param_.min_img_size, std::min(param_.max_img_size, scale * src.cols));
+    float new_height = std::max(param_.min_img_size, std::min(param_.max_img_size, scale * src.rows));
     //printf("%f %f %f %f %f %f %f %f %f\n", s, a, b, scale, ratio, hs, ws, new_width, new_height);
     cv::Mat M(2, 3, CV_32F);
     M.at<float>(0, 0) = hs * a - s * b * ws;
@@ -152,15 +195,16 @@ class ImageAugmenter {
     cv::warpAffine(src, temp, M, cv::Size(new_width, new_height),
                      cv::INTER_LINEAR,
                      cv::BORDER_CONSTANT,
-                     cv::Scalar(param_.fill_value_, param_.fill_value_, param_.fill_value_));
+                     cv::Scalar(param_.fill_value, param_.fill_value, param_.fill_value));
     cv::Mat res = temp;
-    if (param_.max_crop_size_ != -1 || param_.min_crop_size_ != -1){
-      CHECK(res.cols >= param_.max_crop_size_ && res.rows >= param_.max_crop_size_&& param_.max_crop_size_ >= param_.min_crop_size_)
+    // crop
+    if (param_.max_crop_size != -1 || param_.min_crop_size != -1){
+      CHECK(res.cols >= param_.max_crop_size && res.rows >= param_.max_crop_size && param_.max_crop_size >= param_.min_crop_size)
           << "input image size smaller than max_crop_size";
-      mshadow::index_t rand_crop_size = NextUInt32(param_.max_crop_size_- param_.min_crop_size_+1, prnd)+ param_.min_crop_size_;
+      mshadow::index_t rand_crop_size = NextUInt32(param_.max_crop_size- param_.min_crop_size+1, prnd)+ param_.min_crop_size;
       mshadow::index_t y = res.rows - rand_crop_size;
       mshadow::index_t x = res.cols - rand_crop_size;
-      if (param_.rand_crop_ != 0) {
+      if (param_.rand_crop != 0) {
         y = NextUInt32(y + 1, prnd);
         x = NextUInt32(x + 1, prnd);
       }
@@ -168,13 +212,13 @@ class ImageAugmenter {
         y /= 2; x /= 2;
       }
       cv::Rect roi(x, y, rand_crop_size, rand_crop_size);
-      cv::resize(res(roi), res, cv::Size(shape_[1], shape_[2]));
+      cv::resize(res(roi), res, cv::Size(param_.input_shape[1], param_.input_shape[2]));
     }
     else{
-      CHECK(static_cast<mshadow::index_t>(res.cols) >= shape_[1] && static_cast<mshadow::index_t>(res.rows) >= shape_[2]) 
+      CHECK(static_cast<mshadow::index_t>(res.cols) >= param_.input_shape[1] && static_cast<mshadow::index_t>(res.rows) >= param_.input_shape[2]) 
           << "input image size smaller than input shape";
-      mshadow::index_t y = res.rows - shape_[2];
-      mshadow::index_t x = res.cols - shape_[1];
+      mshadow::index_t y = res.rows - param_.input_shape[2];
+      mshadow::index_t x = res.cols - param_.input_shape[1];
       if (param_.rand_crop_ != 0) {
         y = NextUInt32(y + 1, prnd);
         x = NextUInt32(x + 1, prnd);
@@ -182,7 +226,7 @@ class ImageAugmenter {
       else {
         y /= 2; x /= 2;
       }
-      cv::Rect roi(x, y, shape_[1], shape_[2]);
+      cv::Rect roi(x, y, param_.input_shape[1], param_.input_shape[2]);
       res = res(roi);
     }
     return res;
@@ -195,9 +239,9 @@ class ImageAugmenter {
    * \param source of random number
    * \param dst the pointer to the place where we want to store the result
    */
-  virtual mshadow::Tensor<cpu, 3> Process(mshadow::Tensor<cpu, 3> data,
+  virtual mshadow::Tensor<cpu, 3> OpencvProcess(mshadow::Tensor<cpu, 3> data,
                                           common::RANDOM_ENGINE *prnd) {
-    if (!NeedProcess()) return data;
+    if (!NeedOpencvProcess()) return data;
     cv::Mat res(data.size(1), data.size(2), CV_8UC3);
     for (index_t i = 0; i < data.size(1); ++i) {
       for (index_t j = 0; j < data.size(2); ++j) {
@@ -206,7 +250,7 @@ class ImageAugmenter {
         res.at<cv::Vec3b>(i, j)[2] = data[0][i][j];
       }
     }
-    res = this->Process(res, prnd);
+    res = this->OpencvProcess(res, prnd);
     tmpres.Resize(mshadow::Shape3(3, res.rows, res.cols));
     for (index_t i = 0; i < tmpres.size(1); ++i) {
       for (index_t j = 0; j < tmpres.size(2); ++j) {
@@ -219,12 +263,12 @@ class ImageAugmenter {
     return tmpres;
   }
 
-  virtual void Process(unsigned char *dptr, size_t sz,
+  virtual void OpencvProcess(unsigned char *dptr, size_t sz,
                        mshadow::TensorContainer<cpu, 3> *p_data,
                        common::RANDOM_ENGINE *prnd) {
     cv::Mat buf(1, sz, CV_8U, dptr);
     cv::Mat res = cv::imdecode(buf, 1);
-    res = this->Process(res, prnd);
+    res = this->OpencvProcess(res, prnd);
     p_data->Resize(mshadow::Shape3(3, res.rows, res.cols));
     for (index_t i = 0; i < p_data->size(1); ++i) {
       for (index_t j = 0; j < p_data->size(2); ++j) {
@@ -237,20 +281,117 @@ class ImageAugmenter {
     res.release();
   }
 
+  void TensorProcess(mshadow::TensorContainer<cpu, 3> *p_data,
+                       common::RANDOM_ENGINE *prnd) {
+    img_.Resize(mshadow::Shape3((*p_data).shape_[0], param_.input_shape[1], param_.input_shape[2]));
+    if (param_.input_shape[1] == 1) {
+      img_ = (*p_data) * param_.scale;
+    } else {
+      CHECK(p_data->size(1) >= param_.input_shape[1] && p_data->size(2) >= param_.input_shape[2])
+          << "Data size must be bigger than the input size to net.";
+      mshadow::index_t yy = p_data->size(1) - param_.input_shape[1];
+      mshadow::index_t xx = p_data->size(2) - param_.input_shape[2];
+      if (param_.rand_crop != 0 && (yy != 0 || xx != 0)) {
+        yy = NextUInt32(yy + 1, prnd);
+        xx = NextUInt32(xx + 1, prnd);
+      } else {
+        yy /= 2; xx /= 2;
+      }
+      if (p_data->size(1) != param_.input_shape[1] && param_.crop_y_start != -1) {
+        yy = param_.crop_y_start;
+      }
+      if (p_data->size(2) != param_.input_shape[2] && param_.crop_x_start != -1) {
+        xx = param_.crop_x_start;
+      }
+      float contrast = NextDouble(prnd) * param_.max_random_contrast * 2 - param_.max_random_contrast + 1;
+      float illumination = NextDouble(prnd) * param_.max_random_illumination * 2 - param_.max_random_illumination;
+      if (param_.mean_r > 0.0f || param_.mean_g > 0.0f || param_.mean_b > 0.0f) {
+        // substract mean value
+        (*p_data)[0] -= param_.mean_b; (*p_data)[1] -= param_.mean_g; (*p_data)[2] -= param_.mean_r;
+        if ((param_.rand_mirror != 0 && NextDouble(rnd) < 0.5f) || param_.mirror == 1) {
+          img_ = mirror(crop((*p_data) * contrast + illumination, img_[0].shape_, yy, xx)) * param_.scale;
+        } else {
+          img_ = crop((*p_data) * contrast + illumination, img_[0].shape_, yy, xx) * param_.scale ;
+        }
+      } else if (!meanfile_ready_ || param_.mean_img.length() == 0) {
+        // do not substract anything
+        if (param_.rand_mirror != 0 && NextDouble(prnd) < 0.5f) {
+          img_ = mirror(crop((*p_data), img_[0].shape_, yy, xx)) * param_.scale;
+        } else {
+          img_ = crop((*p_data), img_[0].shape_, yy, xx) * param_.scale ;
+        }
+      } else {
+        // substract mean image
+        if ((param_.rand_mirror != 0 && NextDouble(prnd) < 0.5f) || param_.mirror == 1) {
+          if (p_data->shape_ == meanimg_.shape_) {
+            img_ = mirror(crop(((*p_data) - meanimg_) * contrast + illumination, img_[0].shape_, yy, xx)) * param_.scale;
+          } else {
+            img_ = (mirror(crop((*p_data), img_[0].shape_, yy, xx) - meanimg_) * contrast + illumination) * param_.scale;
+          }
+        } else {
+          if (p_data->shape_ == meanimg_.shape_){
+            img_ = crop(((*p_data) - meanimg_) * contrast + illumination, img_[0].shape_, yy, xx) * param_.scale;
+          } else {
+            img_ = ((crop((*p_data), img_[0].shape_, yy, xx) - meanimg_) * contrast + illumination) * param_.scale;
+          }
+        }
+      }
+    }
+    out_.data = img_;
+  } 
+
+  inline void CreateMeanImg(void) {
+    if (silent_ == 0) {
+      printf("cannot find %s: create mean image, this will take some time...\n", name_meanimg_.c_str());
+    }
+    time_t start = time(NULL);
+    unsigned long elapsed = 0;
+    size_t imcnt = 1;
+
+    CHECK(this->Next_()) << "input iterator failed.";
+    meanimg_.Resize(mshadow::Shape3(shape_[0], shape_[1], shape_[2]));
+    mshadow::Copy(meanimg_, img_);
+    while (this->Next()) {
+      meanimg_ += img_; imcnt += 1;
+      elapsed = (long)(time(NULL) - start);
+      if (imcnt % 1000 == 0 && silent_ == 0) {
+        printf("\r                                                               \r");
+        printf("[%8lu] images processed, %ld sec elapsed", imcnt, elapsed);
+        fflush(stdout);
+      }
+    }
+    meanimg_ *= (1.0f / imcnt);
+
+    dmlc::Stream *fo = dmlc::Stream::Create(name_meanimg_.c_str(), "w");
+    meanimg_.SaveBinary(*fo);
+    delete fo;
+    if (silent_ == 0) {
+      printf("save mean image to %s..\n", name_meanimg_.c_str());
+    }
+    meanfile_ready_ = true;
+  }
+
+
  private:
-  // whether skip processing
-  inline bool NeedProcess(void) const {
-    if (param_.max_rotate_angle_ > 0 || param_.max_shear_ratio_ > 0.0f
-        || param_.rotate_ > 0 || rotate_list_.size() > 0) return true;
-    if (param_.min_crop_size_ > 0 && param_.max_crop_size_ > 0) return true;
+  // whether skip opencv processing
+  inline bool NeedOpencvProcess(void) const {
+    if (param_.max_rotate_angle > 0 || param_.max_shear_ratio > 0.0f
+        || param_.rotate > 0 || rotate_list_.size() > 0) return true;
+    if (param_.min_crop_size > 0 && param_.max_crop_size > 0) return true;
     return false;
   }
   // temp input space
-  mshadow::TensorContainer<cpu, 3> tmpres;
+  mshadow::TensorContainer<cpu, 3> tmpres_;
+  // mean image
+  mshadow::TensorContainer<cpu, 3> meanimg_;
+  /*! \brief temp space */
+  mshadow::TensorContainer<cpu, 3> img_;
   // temporal space
-  cv::Mat temp0, temp, temp2;
+  cv::Mat temp_;
   // rotation param
-  cv::Mat rotateM;
+  cv::Mat rotateM_;
+  // whether the mean file is ready
+  bool menafile_ready_;
   // parameters
   ImageAugmentParam param_;
   /*! \brief input shape */
diff --git a/src/io/iter_batch.h b/src/io/iter_batch.h
index f258bc2d6afd..4d95b92cce1e 100644
--- a/src/io/iter_batch.h
+++ b/src/io/iter_batch.h
@@ -16,26 +16,33 @@ namespace io {
 // Batch parameters
 struct BatchParam : public dmlc::Parameter<BatchParam> {
   /*! \brief label width */
-  index_t batch_size_;
+  index_t batch_size;
+  /*! \brief input shape */
+  // TODO: haven't modify all shape_
+  TShape input_shape;
   /*! \brief label width */
-  index_t label_width_;
+  index_t label_width;
   /*! \brief use round roubin to handle overflow batch */
-  bool round_batch_;
+  bool round_batch;
   /*! \brief skip read */
-  bool test_skipread_;
+  bool test_skipread;
   /*! \brief silent */
-  bool silent_;
+  bool silent;
   // declare parameters
   DMLC_DECLARE_PARAMETER(BatchParam) {
-    DMLC_DECLARE_FIELD(batch_size_).set_default(1)
+    DMLC_DECLARE_FIELD(batch_size)
         .describe("Batch size.");
-    DMLC_DECLARE_FIELD(label_width_).set_default(1)
+    float input_shape_default = {3, 224, 224};
+    DMLC_DECLARE_FIELD(input_shape).set_default(TShape(input_shape_default, input_shape_default + 3))
+        .set_expect_ndim(3).enforce_nonzero()
+        .describe("Input shape of the neural net");   
+    DMLC_DECLARE_FIELD(label_width).set_default(1)
         .describe("Label width.");
-    DMLC_DECLARE_FIELD(round_batch_).set_default(false)
+    DMLC_DECLARE_FIELD(round_batch).set_default(false)
         .describe("Use round robin to handle overflow batch.");
-    DMLC_DECLARE_FIELD(test_skipread_).set_default(false)
+    DMLC_DECLARE_FIELD(test_skipread).set_default(false)
         .describe("Skip read for testing.");
-    DMLC_DECLARE_FIELD(silent_).set_default(false)
+    DMLC_DECLARE_FIELD(silent).set_default(false)
         .describe("Whether to print batch information.");
   }
 };
@@ -54,20 +61,14 @@ class BatchAdaptIter: public IIterator<DataBatch> {
     std::vector<std::pair<std::string, std::string> > kwargs_left;
     // init batch param, it could have similar param with 
     kwargs_left = param_.InitAllowUnknown(kwargs);
-    for (size_t i = 0; i < kwargs_left.size(); i++) {
-      if (!strcmp(kwargs_left[i].first.c_str(), "input_shape")) {
-        CHECK(sscanf(kwargs_left[i].second.c_str(), "%u,%u,%u", &shape_[1], &shape_[2], &shape_[3]) == 3)
-          << "input_shape must be three consecutive integers without space example: 1,1,200 ";
-      }
-    }
     // init base iterator
     base_->Init(kwargs);
     mshadow::Shape<4> tshape = shape_;
-    tshape[0] = param_.batch_size_;
+    tshape[0] = param_.batch_size;
     AllocSpaceDense(false);
   }
   virtual void BeforeFirst(void) {
-    if (param_.round_batch_ == 0 || num_overflow_ == 0) {
+    if (param_.round_batch == 0 || num_overflow_ == 0) {
       // otherise, we already called before first
       base_->BeforeFirst();
     } else {
@@ -79,7 +80,7 @@ class BatchAdaptIter: public IIterator<DataBatch> {
     out_.num_batch_padd = 0;
 
     // skip read if in head version
-    if (param_.test_skipread_ != 0 && head_ == 0) return true;
+    if (param_.test_skipread != 0 && head_ == 0) return true;
     else this->head_ = 0;
 
     // if overflow from previous round, directly return false, until before first is called
@@ -92,17 +93,17 @@ class BatchAdaptIter: public IIterator<DataBatch> {
       out_.inst_index[top] = d.index;
       mshadow::Copy(data[top], d.data[0].get<mshadow::cpu, 3, float>());
 
-      if (++ top >= param_.batch_size_) {
+      if (++ top >= param_.batch_size) {
         out_.data[0] = TBlob(data);
         out_.data[1] = TBlob(label);
         return true;
       }
     }
     if (top != 0) {
-      if (param_.round_batch_ != 0) {
+      if (param_.round_batch != 0) {
         num_overflow_ = 0;
         base_->BeforeFirst();
-        for (; top < param_.batch_size_; ++top, ++num_overflow_) {
+        for (; top < param_.batch_size; ++top, ++num_overflow_) {
           CHECK(base_->Next()) << "number of input must be bigger than batch size";
           const DataInst& d = base_->Value();
           mshadow::Copy(label[top], d.data[1].get<mshadow::cpu, 1, float>());
@@ -111,7 +112,7 @@ class BatchAdaptIter: public IIterator<DataBatch> {
         }
         out_.num_batch_padd = num_overflow_;
       } else {
-        out_.num_batch_padd = param_.batch_size_ - top;
+        out_.num_batch_padd = param_.batch_size - top;
       }
       out_.data[0] = TBlob(data);
       out_.data[1] = TBlob(label);
@@ -128,8 +129,6 @@ class BatchAdaptIter: public IIterator<DataBatch> {
   BatchParam param_;
   /*! \brief base iterator */
   IIterator<DataInst> *base_;
-  /*! \brief input shape */
-  mshadow::Shape<4> shape_;
   /*! \brief output data */
   DataBatch out_;
   /*! \brief on first */
@@ -143,10 +142,10 @@ class BatchAdaptIter: public IIterator<DataBatch> {
   // Functions that allocate and free tensor space
   inline void AllocSpaceDense(bool pad = false) { 
     data = mshadow::NewTensor<mshadow::cpu>(shape_, 0.0f, pad);
-    mshadow::Shape<2> lshape = mshadow::Shape2(param_.batch_size_, param_.label_width_);
+    mshadow::Shape<2> lshape = mshadow::Shape2(param_.batch_size, param_.label_width);
     label = mshadow::NewTensor<mshadow::cpu>(lshape, 0.0f, pad);
-    out_.inst_index = new unsigned[param_.batch_size_];
-    out_.batch_size = param_.batch_size_;
+    out_.inst_index = new unsigned[param_.batch_size];
+    out_.batch_size = param_.batch_size;
     out_.data.resize(2);
   }
   /*! \brief auxiliary function to free space, if needed, dense only */
diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc
index 9977ddd2290c..1589fd5ad6c7 100644
--- a/src/io/iter_image_recordio.cc
+++ b/src/io/iter_image_recordio.cc
@@ -31,7 +31,7 @@ class ImageLabelMap {
   explicit ImageLabelMap(const char *path_imglist,
                          mshadow::index_t label_width,
                          bool silent) {
-    label_width_ = label_width;
+    label_width = label_width;
     image_index_.clear();
     label_.clear();
     idx2label_.clear();
@@ -45,7 +45,7 @@ class ImageLabelMap {
       // skip space
       while (isspace(*p) && p != end) ++p;
       image_index_.push_back(static_cast<size_t>(atol(p)));
-      for (size_t i = 0; i < label_width_; ++i) {
+      for (size_t i = 0; i < label_width; ++i) {
         // skip till space
         while (!isspace(*p) && p != end) ++p;
         // skip space
@@ -58,7 +58,7 @@ class ImageLabelMap {
     // be careful not to resize label_ afterwards
     idx2label_.reserve(image_index_.size());
     for (size_t i = 0; i < image_index_.size(); ++i) {
-      idx2label_[image_index_[i]] = dmlc::BeginPtr(label_) + i * label_width_;
+      idx2label_[image_index_[i]] = dmlc::BeginPtr(label_) + i * label_width;
     }
     if (!silent) {
       LOG(INFO) << "Loaded ImageList from " << path_imglist << ' '
@@ -70,12 +70,12 @@ class ImageLabelMap {
     std::unordered_map<size_t, real_t*>::const_iterator it
         = idx2label_.find(imid);
     CHECK(it != idx2label_.end()) << "fail to find imagelabel for id " << imid;
-    return mshadow::Tensor<cpu, 1>(it->second, mshadow::Shape1(label_width_));
+    return mshadow::Tensor<cpu, 1>(it->second, mshadow::Shape1(label_width));
   }
 
  private:
   // label with_
-  mshadow::index_t label_width_;
+  mshadow::index_t label_width;
   // image index of each record
   std::vector<size_t> image_index_;
   // real label content
@@ -87,32 +87,32 @@ class ImageLabelMap {
 // Define image record parser parameters
 struct ImageRecParserParam : public dmlc::Parameter<ImageRecParserParam> {
   /*! \brief path to image list */
-  std::string path_imglist_;
+  std::string path_imglist;
   /*! \brief path to image recordio */
-  std::string path_imgrec_;
+  std::string path_imgrec;
   /*! \brief number of threads */
-  int nthread_;
+  int nthread;
   /*! \brief whether to remain silent */
-  bool silent_;
+  bool silent;
   /*! \brief number of distributed worker */
-  int dist_num_worker_, dist_worker_rank_;
+  int dist_num_worker, dist_worker_rank;
   /*! \brief label-width */
-  int label_width_;
+  int label_width;
   // declare parameters
   DMLC_DECLARE_PARAMETER(ImageRecParserParam) {
-    DMLC_DECLARE_FIELD(path_imglist_).set_default("")
+    DMLC_DECLARE_FIELD(path_imglist).set_default("")
         .describe("Path to image list.");
-    DMLC_DECLARE_FIELD(path_imgrec_).set_default("./data/imgrec.rec")
+    DMLC_DECLARE_FIELD(path_imgrec).set_default("./data/imgrec.rec")
         .describe("Path to image record file.");
-    DMLC_DECLARE_FIELD(nthread_).set_lower_bound(1).set_default(4)
+    DMLC_DECLARE_FIELD(nthread).set_lower_bound(1).set_default(4)
         .describe("Number of thread to do parsing.");
-    DMLC_DECLARE_FIELD(label_width_).set_lower_bound(1).set_default(1)
+    DMLC_DECLARE_FIELD(label_width).set_lower_bound(1).set_default(1)
         .describe("How many labels for an image.");
-    DMLC_DECLARE_FIELD(silent_).set_default(false)
+    DMLC_DECLARE_FIELD(silent).set_default(false)
         .describe("Whether to output parser information.");
-    DMLC_DECLARE_FIELD(dist_num_worker_).set_lower_bound(1).set_default(1)
+    DMLC_DECLARE_FIELD(dist_num_worker).set_lower_bound(1).set_default(1)
         .describe("Dist worker number.");
-    DMLC_DECLARE_FIELD(dist_worker_rank_).set_default(0)
+    DMLC_DECLARE_FIELD(dist_worker_rank).set_default(0)
         .describe("Dist worker rank.");
   }
 };
@@ -170,12 +170,12 @@ inline void ImageRecordIOParser::Init(const std::vector<std::pair<std::string, s
   {
     maxthread = std::max(omp_get_num_procs() / 2 - 1, 1);
   }
-  param_.nthread_ = std::min(maxthread, param_.nthread_);
-  #pragma omp parallel num_threads(param_.nthread_)
+  param_.nthread = std::min(maxthread, param_.nthread);
+  #pragma omp parallel num_threads(param_.nthread)
   {
     threadget = omp_get_num_threads();
   }
-  param_.nthread_ = threadget;
+  param_.nthread = threadget;
   // setup decoders
   for (int i = 0; i < threadget; ++i) {
     augmenters_.push_back(new ImageAugmenter());
@@ -187,27 +187,27 @@ inline void ImageRecordIOParser::Init(const std::vector<std::pair<std::string, s
   // TODO, hack
   const char *ps_rank = getenv("PS_RANK");
   if (ps_rank != NULL) {
-    param_.dist_worker_rank_ = atoi(ps_rank);
+    param_.dist_worker_rank = atoi(ps_rank);
   }
 
-  if (param_.path_imglist_.length() != 0) {
-    label_map_ = new ImageLabelMap(param_.path_imglist_.c_str(),
-                                   param_.label_width_, param_.silent_ != 0);
+  if (param_.path_imglist.length() != 0) {
+    label_map_ = new ImageLabelMap(param_.path_imglist.c_str(),
+                                   param_.label_width, param_.silent != 0);
   } else {
-    param_.label_width_ = 1;
+    param_.label_width = 1;
   }
-  CHECK(param_.path_imgrec_.length() != 0)
+  CHECK(param_.path_imgrec.length() != 0)
     << "ImageRecordIOIterator: must specify image_rec";
 #if MSHADOW_DIST_PS
     // TODO move to a better place
-    param_.dist_num_worker_ = ::ps::RankSize();
-    param_.dist_worker_rank_ = ::ps::MyRank();
-    LOG(INFO) << "rank " << param_.dist_worker_rank_
-              << " in " << param_.dist_num_worker_;
+    param_.dist_num_worker = ::ps::RankSize();
+    param_.dist_worker_rank = ::ps::MyRank();
+    LOG(INFO) << "rank " << param_.dist_worker_rank
+              << " in " << param_.dist_num_worker;
 #endif
   source_ = dmlc::InputSplit::Create
-      (param_.path_imgrec_.c_str(), param_.dist_worker_rank_,
-       param_.dist_num_worker_, "recordio");
+      (param_.path_imgrec.c_str(), param_.dist_worker_rank,
+       param_.dist_num_worker, "recordio");
   // use 64 MB chunk when possible
   source_->HintChunkSize(8 << 20UL);
 }
@@ -217,12 +217,12 @@ ParseNext(std::vector<InstVector> *out_vec) {
   CHECK(source_ != NULL);
   dmlc::InputSplit::Blob chunk;
   if (!source_->NextChunk(&chunk)) return false;
-  out_vec->resize(param_.nthread_);
-  #pragma omp parallel num_threads(param_.nthread_)
+  out_vec->resize(param_.nthread);
+  #pragma omp parallel num_threads(param_.nthread)
   {
-    CHECK(omp_get_num_threads() == param_.nthread_);
+    CHECK(omp_get_num_threads() == param_.nthread);
     int tid = omp_get_thread_num();
-    dmlc::RecordIOChunkReader reader(chunk, tid, param_.nthread_);
+    dmlc::RecordIOChunkReader reader(chunk, tid, param_.nthread);
     ImageRecordIO rec;
     dmlc::InputSplit::Blob blob;
     // image data
@@ -237,7 +237,7 @@ ParseNext(std::vector<InstVector> *out_vec) {
       res = augmenters_[tid]->Process(res, prnds_[tid]);
       out.Push(static_cast<unsigned>(rec.image_index()),
                mshadow::Shape3(3, res.rows, res.cols),
-               mshadow::Shape1(param_.label_width_));
+               mshadow::Shape1(param_.label_width));
       DataInst inst = out.Back();
       // turn datainst into tensor
       mshadow::Tensor<mshadow::cpu, 3> data = inst.data[0].get<mshadow::cpu, 3, float>(); 

From d6ceed317fd1bd82737c8b0361f344910c385d83 Mon Sep 17 00:00:00 2001
From: tianjun <xiaotj1990327@gmail.com>
Date: Sun, 6 Sep 2015 10:05:36 +0800
Subject: [PATCH 05/15] call augprocess in base iter

---
 src/io/image_augmenter.h      | 40 +++++++++++++++++------------------
 src/io/iter_batch.h           |  4 ++--
 src/io/iter_image_recordio.cc | 24 +++++++--------------
 3 files changed, 30 insertions(+), 38 deletions(-)

diff --git a/src/io/image_augmenter.h b/src/io/image_augmenter.h
index a81e5297d5b3..38efcc58e61a 100644
--- a/src/io/image_augmenter.h
+++ b/src/io/image_augmenter.h
@@ -263,24 +263,6 @@ class ImageAugmenter {
     return tmpres;
   }
 
-  virtual void OpencvProcess(unsigned char *dptr, size_t sz,
-                       mshadow::TensorContainer<cpu, 3> *p_data,
-                       common::RANDOM_ENGINE *prnd) {
-    cv::Mat buf(1, sz, CV_8U, dptr);
-    cv::Mat res = cv::imdecode(buf, 1);
-    res = this->OpencvProcess(res, prnd);
-    p_data->Resize(mshadow::Shape3(3, res.rows, res.cols));
-    for (index_t i = 0; i < p_data->size(1); ++i) {
-      for (index_t j = 0; j < p_data->size(2); ++j) {
-        cv::Vec3b bgr = res.at<cv::Vec3b>(i, j);
-        (*p_data)[0][i][j] = bgr[2];
-        (*p_data)[1][i][j] = bgr[1];
-        (*p_data)[2][i][j] = bgr[0];
-      }
-    }
-    res.release();
-  }
-
   void TensorProcess(mshadow::TensorContainer<cpu, 3> *p_data,
                        common::RANDOM_ENGINE *prnd) {
     img_.Resize(mshadow::Shape3((*p_data).shape_[0], param_.input_shape[1], param_.input_shape[2]));
@@ -337,7 +319,7 @@ class ImageAugmenter {
         }
       }
     }
-    out_.data = img_;
+    (*p_data) = img_;
   } 
 
   inline void CreateMeanImg(void) {
@@ -371,7 +353,25 @@ class ImageAugmenter {
     meanfile_ready_ = true;
   }
 
-
+  virtual void Process(unsigned char *dptr, size_t sz,
+                       mshadow::TensorContainer<cpu, 3> *p_data,
+                       common::RANDOM_ENGINE *prnd) {
+    cv::Mat buf(1, sz, CV_8U, dptr);
+    cv::Mat res = cv::imdecode(buf, 1);
+    res = this->OpencvProcess(res, prnd);
+    p_data->Resize(mshadow::Shape3(3, res.rows, res.cols));
+    for (index_t i = 0; i < p_data->size(1); ++i) {
+      for (index_t j = 0; j < p_data->size(2); ++j) {
+        cv::Vec3b bgr = res.at<cv::Vec3b>(i, j);
+        (*p_data)[0][i][j] = bgr[2];
+        (*p_data)[1][i][j] = bgr[1];
+        (*p_data)[2][i][j] = bgr[0];
+      }
+    }
+    res.release();
+    this->TensorProcess(p_data, prnd);
+  }
+ 
  private:
   // whether skip opencv processing
   inline bool NeedOpencvProcess(void) const {
diff --git a/src/io/iter_batch.h b/src/io/iter_batch.h
index 4d95b92cce1e..7fe8f4440513 100644
--- a/src/io/iter_batch.h
+++ b/src/io/iter_batch.h
@@ -63,7 +63,7 @@ class BatchAdaptIter: public IIterator<DataBatch> {
     kwargs_left = param_.InitAllowUnknown(kwargs);
     // init base iterator
     base_->Init(kwargs);
-    mshadow::Shape<4> tshape = shape_;
+    mshadow::Shape<4> tshape = param_.input_shape;
     tshape[0] = param_.batch_size;
     AllocSpaceDense(false);
   }
@@ -141,7 +141,7 @@ class BatchAdaptIter: public IIterator<DataBatch> {
   mshadow::Tensor<mshadow::cpu, 4> data;
   // Functions that allocate and free tensor space
   inline void AllocSpaceDense(bool pad = false) { 
-    data = mshadow::NewTensor<mshadow::cpu>(shape_, 0.0f, pad);
+    data = mshadow::NewTensor<mshadow::cpu>(param_.input_shape, 0.0f, pad);
     mshadow::Shape<2> lshape = mshadow::Shape2(param_.batch_size, param_.label_width);
     label = mshadow::NewTensor<mshadow::cpu>(lshape, 0.0f, pad);
     out_.inst_index = new unsigned[param_.batch_size];
diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc
index 1589fd5ad6c7..0c44a2346e4a 100644
--- a/src/io/iter_image_recordio.cc
+++ b/src/io/iter_image_recordio.cc
@@ -98,6 +98,8 @@ struct ImageRecParserParam : public dmlc::Parameter<ImageRecParserParam> {
   int dist_num_worker, dist_worker_rank;
   /*! \brief label-width */
   int label_width;
+  /*! \brief input shape */
+  TShape input_shape;
   // declare parameters
   DMLC_DECLARE_PARAMETER(ImageRecParserParam) {
     DMLC_DECLARE_FIELD(path_imglist).set_default("")
@@ -114,6 +116,10 @@ struct ImageRecParserParam : public dmlc::Parameter<ImageRecParserParam> {
         .describe("Dist worker number.");
     DMLC_DECLARE_FIELD(dist_worker_rank).set_default(0)
         .describe("Dist worker rank.");
+    float input_shape_default = {3, 224, 224};
+    DMLC_DECLARE_FIELD(input_shape).set_default(TShape(input_shape_default, input_shape_default + 3))
+        .set_expect_ndim(3).enforce_nonzero()
+        .describe("Input shape of the neural net");  
   }
 };
 
@@ -229,33 +235,19 @@ ParseNext(std::vector<InstVector> *out_vec) {
     InstVector &out = (*out_vec)[tid];
     out.Clear();
     while (reader.NextRecord(&blob)) {
-      // result holder
-      cv::Mat res;
-      rec.Load(blob.dptr, blob.size);
-      cv::Mat buf(1, rec.content_size, CV_8U, rec.content);
-      res = cv::imdecode(buf, 1);
-      res = augmenters_[tid]->Process(res, prnds_[tid]);
       out.Push(static_cast<unsigned>(rec.image_index()),
-               mshadow::Shape3(3, res.rows, res.cols),
+               mshadow::Shape3(param_.input_shape[0], param_.input_shape[0], param_.input_shape[0]),
                mshadow::Shape1(param_.label_width));
       DataInst inst = out.Back();
       // turn datainst into tensor
       mshadow::Tensor<mshadow::cpu, 3> data = inst.data[0].get<mshadow::cpu, 3, float>(); 
       mshadow::Tensor<mshadow::cpu, 1> label = inst.data[1].get<mshadow::cpu, 1, float>(); 
-      for (int i = 0; i < res.rows; ++i) {
-        for (int j = 0; j < res.cols; ++j) {
-          cv::Vec3b bgr = res.at<cv::Vec3b>(i, j);
-          data[0][i][j] = bgr[2];
-          data[1][i][j] = bgr[1];
-          data[2][i][j] = bgr[0];
-        }
-      }
+      augmenters_[tid]->Process(rec.content, rec.content_size, &data, prnd);
       if (label_map_ != NULL) {
         mshadow::Copy(label, label_map_->Find(rec.image_index()));
       } else {
         label[0] = rec.header.label;
       }
-      res.release();
     }
   }
   return true;

From 06637a3995af6a9fdcce360cdb39c8d67ac1020b Mon Sep 17 00:00:00 2001
From: sneakerkg <xiaotj1990327@gmail.com>
Date: Tue, 8 Sep 2015 01:13:59 +0800
Subject: [PATCH 06/15] recio works

---
 mshadow                       |   2 +-
 src/common/utils.h            |   5 +-
 src/io/image_augmenter.h      | 192 ++++++++++++++++++----------------
 src/io/image_recordio.h       |   8 +-
 src/io/inst_vector.h          |  16 +--
 src/io/io.cc                  |   2 +-
 src/io/iter_batch.h           |  51 +++++----
 src/io/iter_image_recordio.cc |  99 ++++++++++++++----
 src/utils/decoder.h           | 128 -----------------------
 tests/python/test_io.py       |  70 +++++++++----
 10 files changed, 274 insertions(+), 299 deletions(-)
 delete mode 100644 src/utils/decoder.h

diff --git a/mshadow b/mshadow
index 4449f22c6854..3053f8cdfea0 160000
--- a/mshadow
+++ b/mshadow
@@ -1 +1 @@
-Subproject commit 4449f22c68543435e5b4f3239de944c03fc0ea46
+Subproject commit 3053f8cdfea0274739282ced015ad458090760e8
diff --git a/src/common/utils.h b/src/common/utils.h
index b5edb78bd6f9..29cb9f0e2f2a 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -22,9 +22,10 @@ namespace common {
  */
 typedef std::mt19937 RANDOM_ENGINE;
 // Get a double float, prnd is the pointer to a Random Engine
-#define NextDouble(prnd) std::generate_canonical<float, 10>(*prnd) 
+#define NextDouble(prnd) std::generate_canonical<float, 10>(*prnd)
 // Get a random int in [0, range)
-#define NextUInt32(range, prnd) static_cast<uint32_t>(floor(std::generate_canonical<float, 10>(*prnd) * range))
+#define NextUInt32(range, prnd) static_cast<uint32_t> \
+(floor(std::generate_canonical<float, 10>(*prnd) * range))
 
 /*!
  * \brief Helper functions.
diff --git a/src/io/image_augmenter.h b/src/io/image_augmenter.h
index 38efcc58e61a..a4b77f5a41df 100644
--- a/src/io/image_augmenter.h
+++ b/src/io/image_augmenter.h
@@ -1,4 +1,5 @@
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file image_augmenter_opencv.hpp
  * \brief threaded version of page iterator
  * \author Naiyan Wang, Tianqi Chen, Tianjun Xiao
@@ -7,6 +8,10 @@
 #define MXNET_IO_IMAGE_AUGMENTER_H_
 
 #include <opencv2/opencv.hpp>
+#include <utility>
+#include <string>
+#include <algorithm>
+#include <vector>
 #include "../common/utils.h"
 
 namespace mxnet {
@@ -41,6 +46,7 @@ struct ImageAugmentParam : public dmlc::Parameter<ImageAugmentParam> {
   int rotate;
   /*! \brief filled color while padding */
   int fill_value;
+  // The following are params for tensor process
   /*! \brief whether to mirror the image */
   bool mirror;
   /*! \brief whether to perform rand mirror the image */
@@ -55,14 +61,17 @@ struct ImageAugmentParam : public dmlc::Parameter<ImageAugmentParam> {
   float mean_b;
   /*! \brief shape of the image data*/
   TShape input_shape;
+  /*! \brief scale on color space */
+  float scale;
   /*! \brief maximum ratio of contrast variation */
-  float max_random_contrast_;
+  float max_random_contrast;
   /*! \brief maximum value of illumination variation */
-  float max_random_illumination_;
+  float max_random_illumination;
+  /*! \brief whether to print augment info */
+  bool silent;
   // declare parameters
-  // TODO: didn't understand the range for some params
   DMLC_DECLARE_PARAMETER(ImageAugmentParam) {
-    DMLC_DECLARE_FIELD(rand_crop_).set_default(true)
+    DMLC_DECLARE_FIELD(rand_crop).set_default(true)
         .describe("Whether we de random cropping");
     DMLC_DECLARE_FIELD(crop_y_start).set_default(-1)
         .describe("Where to nonrandom crop on y");
@@ -81,7 +90,7 @@ struct ImageAugmentParam : public dmlc::Parameter<ImageAugmentParam> {
     DMLC_DECLARE_FIELD(max_random_scale).set_default(1.0f)
         .describe("Maxmum scale ratio");
     DMLC_DECLARE_FIELD(min_random_scale).set_default(1.0f)
-        .describe("Minimum scale ratio");       
+        .describe("Minimum scale ratio");
     DMLC_DECLARE_FIELD(max_img_size).set_default(1e10f)
         .describe("Maxmum image size");
     DMLC_DECLARE_FIELD(min_img_size).set_default(0.0f)
@@ -99,13 +108,16 @@ struct ImageAugmentParam : public dmlc::Parameter<ImageAugmentParam> {
     DMLC_DECLARE_FIELD(mean_r).set_default(0.0f)
         .describe("Mean value on R channel");
     DMLC_DECLARE_FIELD(mean_g).set_default(0.0f)
-        .describe("Mean value on G channel");   
+        .describe("Mean value on G channel");
     DMLC_DECLARE_FIELD(mean_b).set_default(0.0f)
         .describe("Mean value on B channel");
-    float input_shape_default = {3, 224, 224};
-    DMLC_DECLARE_FIELD(input_shape).set_default(TShape(input_shape_default, input_shape_default + 3))
+    index_t input_shape_default[] = {3, 224, 224};
+    DMLC_DECLARE_FIELD(input_shape)
+        .set_default(TShape(input_shape_default, input_shape_default + 3))
         .set_expect_ndim(3).enforce_nonzero()
         .describe("Input shape of the neural net");
+    DMLC_DECLARE_FIELD(scale).set_default(1.0f)
+        .describe("Scale in color space");
     DMLC_DECLARE_FIELD(max_random_contrast).set_default(0.0f)
         .describe("Maximum ratio of contrast variation");
     DMLC_DECLARE_FIELD(max_random_illumination).set_default(0.0f)
@@ -123,7 +135,6 @@ class ImageAugmenter {
   }
   virtual ~ImageAugmenter() {
   }
-  // TODO: Hack the shape and rotate list, didn't use param
   virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
     std::vector<std::pair<std::string, std::string> > kwargs_left;
     kwargs_left = param_.InitAllowUnknown(kwargs);
@@ -142,7 +153,7 @@ class ImageAugmenter {
     if (param_.mean_img.length() != 0) {
       dmlc::Stream *fi = dmlc::Stream::Create(param_.mean_img.c_str(), "r", true);
       if (fi == NULL) {
-        this->CreateMeanImg();
+        meanfile_ready_ = false;
       } else {
         if (param_.silent == 0) {
           printf("loading mean image from %s\n", param_.mean_img.c_str());
@@ -174,15 +185,18 @@ class ImageAugmenter {
     float a = cos(angle / 180.0 * M_PI);
     float b = sin(angle / 180.0 * M_PI);
     // scale
-    float scale = NextDouble(prnd) * (param_.max_random_scale - param_.min_random_scale) + param_.min_random_scale;
+    float scale = NextDouble(prnd) * \
+        (param_.max_random_scale - param_.min_random_scale) + param_.min_random_scale;
     // aspect ratio
-    float ratio = NextDouble(prnd) * param_.max_aspect_ratio * 2 - param_.max_aspect_ratio + 1;
+    float ratio = NextDouble(prnd) * \
+        param_.max_aspect_ratio * 2 - param_.max_aspect_ratio + 1;
     float hs = 2 * scale / (1 + ratio);
     float ws = ratio * hs;
     // new width and height
-    float new_width = std::max(param_.min_img_size, std::min(param_.max_img_size, scale * src.cols));
-    float new_height = std::max(param_.min_img_size, std::min(param_.max_img_size, scale * src.rows));
-    //printf("%f %f %f %f %f %f %f %f %f\n", s, a, b, scale, ratio, hs, ws, new_width, new_height);
+    float new_width = std::max(param_.min_img_size, \
+            std::min(param_.max_img_size, scale * src.cols));
+    float new_height = std::max(param_.min_img_size, \
+            std::min(param_.max_img_size, scale * src.rows));
     cv::Mat M(2, 3, CV_32F);
     M.at<float>(0, 0) = hs * a - s * b * ws;
     M.at<float>(1, 0) = -b * ws;
@@ -192,42 +206,42 @@ class ImageAugmenter {
     float ori_center_height = M.at<float>(1, 0) * src.cols + M.at<float>(1, 1) * src.rows;
     M.at<float>(0, 2) = (new_width - ori_center_width) / 2;
     M.at<float>(1, 2) = (new_height - ori_center_height) / 2;
-    cv::warpAffine(src, temp, M, cv::Size(new_width, new_height),
+    cv::warpAffine(src, temp_, M, cv::Size(new_width, new_height),
                      cv::INTER_LINEAR,
                      cv::BORDER_CONSTANT,
                      cv::Scalar(param_.fill_value, param_.fill_value, param_.fill_value));
-    cv::Mat res = temp;
+    cv::Mat res = temp_;
     // crop
-    if (param_.max_crop_size != -1 || param_.min_crop_size != -1){
-      CHECK(res.cols >= param_.max_crop_size && res.rows >= param_.max_crop_size && param_.max_crop_size >= param_.min_crop_size)
+    if (param_.max_crop_size != -1 || param_.min_crop_size != -1) {
+      CHECK(res.cols >= param_.max_crop_size && res.rows >= \
+              param_.max_crop_size && param_.max_crop_size >= param_.min_crop_size)
           << "input image size smaller than max_crop_size";
-      mshadow::index_t rand_crop_size = NextUInt32(param_.max_crop_size- param_.min_crop_size+1, prnd)+ param_.min_crop_size;
+      mshadow::index_t rand_crop_size = NextUInt32(param_.max_crop_size \
+              - param_.min_crop_size+1, prnd)+ param_.min_crop_size;
       mshadow::index_t y = res.rows - rand_crop_size;
       mshadow::index_t x = res.cols - rand_crop_size;
       if (param_.rand_crop != 0) {
         y = NextUInt32(y + 1, prnd);
         x = NextUInt32(x + 1, prnd);
-      }
-      else {
+      } else {
         y /= 2; x /= 2;
       }
       cv::Rect roi(x, y, rand_crop_size, rand_crop_size);
       cv::resize(res(roi), res, cv::Size(param_.input_shape[1], param_.input_shape[2]));
-    }
-    else{
-      CHECK(static_cast<mshadow::index_t>(res.cols) >= param_.input_shape[1] && static_cast<mshadow::index_t>(res.rows) >= param_.input_shape[2]) 
-          << "input image size smaller than input shape";
-      mshadow::index_t y = res.rows - param_.input_shape[2];
-      mshadow::index_t x = res.cols - param_.input_shape[1];
-      if (param_.rand_crop_ != 0) {
-        y = NextUInt32(y + 1, prnd);
-        x = NextUInt32(x + 1, prnd);
-      }
-      else {
-        y /= 2; x /= 2;
-      }
-      cv::Rect roi(x, y, param_.input_shape[1], param_.input_shape[2]);
-      res = res(roi);
+    } else {
+        CHECK(static_cast<mshadow::index_t>(res.cols) >= param_.input_shape[1] \
+                && static_cast<mshadow::index_t>(res.rows) >= param_.input_shape[2])
+            << "input image size smaller than input shape";
+        mshadow::index_t y = res.rows - param_.input_shape[2];
+        mshadow::index_t x = res.cols - param_.input_shape[1];
+        if (param_.rand_crop != 0) {
+            y = NextUInt32(y + 1, prnd);
+            x = NextUInt32(x + 1, prnd);
+        } else {
+            y /= 2; x /= 2;
+        }
+        cv::Rect roi(x, y, param_.input_shape[1], param_.input_shape[2]);
+        res = res(roi);
     }
     return res;
   }
@@ -251,20 +265,32 @@ class ImageAugmenter {
       }
     }
     res = this->OpencvProcess(res, prnd);
-    tmpres.Resize(mshadow::Shape3(3, res.rows, res.cols));
-    for (index_t i = 0; i < tmpres.size(1); ++i) {
-      for (index_t j = 0; j < tmpres.size(2); ++j) {
+    tmpres_.Resize(mshadow::Shape3(3, res.rows, res.cols));
+    for (index_t i = 0; i < tmpres_.size(1); ++i) {
+      for (index_t j = 0; j < tmpres_.size(2); ++j) {
         cv::Vec3b bgr = res.at<cv::Vec3b>(i, j);
-        tmpres[0][i][j] = bgr[2];
-        tmpres[1][i][j] = bgr[1];
-        tmpres[2][i][j] = bgr[0];
+        tmpres_[0][i][j] = bgr[2];
+        tmpres_[1][i][j] = bgr[1];
+        tmpres_[2][i][j] = bgr[0];
       }
     }
-    return tmpres;
+    return tmpres_;
   }
 
   void TensorProcess(mshadow::TensorContainer<cpu, 3> *p_data,
                        common::RANDOM_ENGINE *prnd) {
+    // Check Newly Created mean image
+    if (meanfile_ready_ == false && param_.mean_img.length() != 0) {
+      dmlc::Stream *fi = dmlc::Stream::Create(param_.mean_img.c_str(), "r", true);
+      if (fi != NULL) {
+        if (param_.silent == 0) {
+          printf("loading mean image from %s\n", param_.mean_img.c_str());
+        }
+        meanimg_.LoadBinary(*fi);
+        delete fi;
+        meanfile_ready_ = true;
+      }
+    }
     img_.Resize(mshadow::Shape3((*p_data).shape_[0], param_.input_shape[1], param_.input_shape[2]));
     if (param_.input_shape[1] == 1) {
       img_ = (*p_data) * param_.scale;
@@ -285,72 +311,51 @@ class ImageAugmenter {
       if (p_data->size(2) != param_.input_shape[2] && param_.crop_x_start != -1) {
         xx = param_.crop_x_start;
       }
-      float contrast = NextDouble(prnd) * param_.max_random_contrast * 2 - param_.max_random_contrast + 1;
-      float illumination = NextDouble(prnd) * param_.max_random_illumination * 2 - param_.max_random_illumination;
+      float contrast = NextDouble(prnd) * param_.max_random_contrast \
+                       * 2 - param_.max_random_contrast + 1;
+      float illumination = NextDouble(prnd) * param_.max_random_illumination \
+                           * 2 - param_.max_random_illumination;
       if (param_.mean_r > 0.0f || param_.mean_g > 0.0f || param_.mean_b > 0.0f) {
         // substract mean value
-        (*p_data)[0] -= param_.mean_b; (*p_data)[1] -= param_.mean_g; (*p_data)[2] -= param_.mean_r;
-        if ((param_.rand_mirror != 0 && NextDouble(rnd) < 0.5f) || param_.mirror == 1) {
-          img_ = mirror(crop((*p_data) * contrast + illumination, img_[0].shape_, yy, xx)) * param_.scale;
+        (*p_data)[0] -= param_.mean_b;
+        (*p_data)[1] -= param_.mean_g;
+        (*p_data)[2] -= param_.mean_r;
+        if ((param_.rand_mirror != 0 && NextDouble(prnd) < 0.5f) || param_.mirror == 1) {
+          img_ = mirror(crop((*p_data) * contrast + illumination, \
+                      img_[0].shape_, yy, xx)) * param_.scale;
         } else {
-          img_ = crop((*p_data) * contrast + illumination, img_[0].shape_, yy, xx) * param_.scale ;
+          img_ = crop((*p_data) * contrast + illumination, \
+                  img_[0].shape_, yy, xx) * param_.scale;
         }
       } else if (!meanfile_ready_ || param_.mean_img.length() == 0) {
         // do not substract anything
-        if (param_.rand_mirror != 0 && NextDouble(prnd) < 0.5f) {
+        if ((param_.rand_mirror != 0 && NextDouble(prnd) < 0.5f) || param_.mirror == 1) {
           img_ = mirror(crop((*p_data), img_[0].shape_, yy, xx)) * param_.scale;
         } else {
-          img_ = crop((*p_data), img_[0].shape_, yy, xx) * param_.scale ;
+          img_ = crop((*p_data), img_[0].shape_, yy, xx) * param_.scale;
         }
       } else {
         // substract mean image
         if ((param_.rand_mirror != 0 && NextDouble(prnd) < 0.5f) || param_.mirror == 1) {
           if (p_data->shape_ == meanimg_.shape_) {
-            img_ = mirror(crop(((*p_data) - meanimg_) * contrast + illumination, img_[0].shape_, yy, xx)) * param_.scale;
+            img_ = mirror(crop(((*p_data) - meanimg_) * contrast \
+                        + illumination, img_[0].shape_, yy, xx)) * param_.scale;
           } else {
-            img_ = (mirror(crop((*p_data), img_[0].shape_, yy, xx) - meanimg_) * contrast + illumination) * param_.scale;
+            img_ = (mirror(crop((*p_data), img_[0].shape_, yy, xx) - meanimg_) \
+                    * contrast + illumination) * param_.scale;
           }
         } else {
-          if (p_data->shape_ == meanimg_.shape_){
-            img_ = crop(((*p_data) - meanimg_) * contrast + illumination, img_[0].shape_, yy, xx) * param_.scale;
+          if (p_data->shape_ == meanimg_.shape_) {
+            img_ = crop(((*p_data) - meanimg_) * contrast + illumination, \
+                    img_[0].shape_, yy, xx) * param_.scale;
           } else {
-            img_ = ((crop((*p_data), img_[0].shape_, yy, xx) - meanimg_) * contrast + illumination) * param_.scale;
+            img_ = ((crop((*p_data), img_[0].shape_, yy, xx) - meanimg_) * \
+                    contrast + illumination) * param_.scale;
           }
         }
       }
     }
     (*p_data) = img_;
-  } 
-
-  inline void CreateMeanImg(void) {
-    if (silent_ == 0) {
-      printf("cannot find %s: create mean image, this will take some time...\n", name_meanimg_.c_str());
-    }
-    time_t start = time(NULL);
-    unsigned long elapsed = 0;
-    size_t imcnt = 1;
-
-    CHECK(this->Next_()) << "input iterator failed.";
-    meanimg_.Resize(mshadow::Shape3(shape_[0], shape_[1], shape_[2]));
-    mshadow::Copy(meanimg_, img_);
-    while (this->Next()) {
-      meanimg_ += img_; imcnt += 1;
-      elapsed = (long)(time(NULL) - start);
-      if (imcnt % 1000 == 0 && silent_ == 0) {
-        printf("\r                                                               \r");
-        printf("[%8lu] images processed, %ld sec elapsed", imcnt, elapsed);
-        fflush(stdout);
-      }
-    }
-    meanimg_ *= (1.0f / imcnt);
-
-    dmlc::Stream *fo = dmlc::Stream::Create(name_meanimg_.c_str(), "w");
-    meanimg_.SaveBinary(*fo);
-    delete fo;
-    if (silent_ == 0) {
-      printf("save mean image to %s..\n", name_meanimg_.c_str());
-    }
-    meanfile_ready_ = true;
   }
 
   virtual void Process(unsigned char *dptr, size_t sz,
@@ -358,7 +363,8 @@ class ImageAugmenter {
                        common::RANDOM_ENGINE *prnd) {
     cv::Mat buf(1, sz, CV_8U, dptr);
     cv::Mat res = cv::imdecode(buf, 1);
-    res = this->OpencvProcess(res, prnd);
+    if (NeedOpencvProcess())
+        res = this->OpencvProcess(res, prnd);
     p_data->Resize(mshadow::Shape3(3, res.rows, res.cols));
     for (index_t i = 0; i < p_data->size(1); ++i) {
       for (index_t j = 0; j < p_data->size(2); ++j) {
@@ -371,7 +377,7 @@ class ImageAugmenter {
     res.release();
     this->TensorProcess(p_data, prnd);
   }
- 
+
  private:
   // whether skip opencv processing
   inline bool NeedOpencvProcess(void) const {
@@ -391,7 +397,7 @@ class ImageAugmenter {
   // rotation param
   cv::Mat rotateM_;
   // whether the mean file is ready
-  bool menafile_ready_;
+  bool meanfile_ready_;
   // parameters
   ImageAugmentParam param_;
   /*! \brief input shape */
@@ -400,5 +406,5 @@ class ImageAugmenter {
   std::vector<int> rotate_list_;
 };
 }  // namespace io
-}  // namespace cxxnet
-#endif
+}  // namespace mxnet
+#endif  // MXNET_IO_IMAGE_AUGMENTER_H_
diff --git a/src/io/image_recordio.h b/src/io/image_recordio.h
index 4aea8aabcb47..3b4fa0302435 100644
--- a/src/io/image_recordio.h
+++ b/src/io/image_recordio.h
@@ -1,4 +1,5 @@
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file image_recordio.h
  * \brief image recordio struct
  */
@@ -7,6 +8,7 @@
 
 #include <dmlc/base.h>
 #include <dmlc/io.h>
+#include <string>
 
 namespace mxnet {
 namespace io {
@@ -67,9 +69,9 @@ struct ImageRecordIO {
    */
   inline void SaveHeader(std::string *blob) const {
     blob->resize(sizeof(header));
-    std::memcpy(dmlc::BeginPtr(*blob), &header, sizeof(header));    
-  }  
-}; 
+    std::memcpy(dmlc::BeginPtr(*blob), &header, sizeof(header));
+  }
+};
 }  // namespace io
 }  // namespace mxnet
 #endif  // MXNET_IO_IMAGE_RECORDIO_H_
diff --git a/src/io/inst_vector.h b/src/io/inst_vector.h
index 4ced7dd64c63..ed560fc2b5da 100644
--- a/src/io/inst_vector.h
+++ b/src/io/inst_vector.h
@@ -1,11 +1,12 @@
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file inst_vector.h
  * \brief holder of a sequence of DataInst in CPU
  *        that are not necessarily of same shape
  */
 
-#ifndef MXNET_INST_VECTOR_H_
-#define MXNET_INST_VECTOR_H_
+#ifndef MXNET_IO_INST_VECTOR_H_
+#define MXNET_IO_INST_VECTOR_H_
 
 #include <mxnet/io.h>
 #include <mxnet/base.h>
@@ -31,7 +32,7 @@ class TensorVector {
     CHECK(i + 1 < offset_.size());
     CHECK(shape_[i].Size() == offset_[i + 1] - offset_[i]);
     return mshadow::Tensor<cpu, dim, DType>
-        ((DType*)dmlc::BeginPtr(content_) + offset_[i], shape_[i]);
+        ((DType*)dmlc::BeginPtr(content_) + offset_[i], shape_[i]);  // NOLINT(*)
   }
   inline mshadow::Tensor<cpu, dim, DType> Back() const {
     return (*this)[Size() - 1];
@@ -52,6 +53,7 @@ class TensorVector {
     content_.clear();
     shape_.clear();
   }
+
  private:
   // offset of the data content
   std::vector<size_t> offset_;
@@ -66,7 +68,7 @@ class TensorVector {
  * non-uniform shape data instance in a shape efficient way
  */
 class InstVector {
- public:  
+ public:
   inline size_t Size(void) const {
     return index_.size();
   }
@@ -94,8 +96,8 @@ class InstVector {
     data_.Push(dshape);
     label_.Push(lshape);
   }
-  
- private:  
+
+ private:
   /*! \brief index of the data */
   std::vector<unsigned> index_;
   // label
@@ -105,4 +107,4 @@ class InstVector {
 };
 }  // namespace io
 }  // namespace mxnet
-#endif  // MXNET_TENSOR_VECTOR_H_
+#endif  // MXNET_IO_INST_VECTOR_H_
diff --git a/src/io/io.cc b/src/io/io.cc
index b2dbc9f8c2c5..8bfb5dbdd570 100644
--- a/src/io/io.cc
+++ b/src/io/io.cc
@@ -17,5 +17,5 @@ namespace io {
 // Register parameters in header files
 DMLC_REGISTER_PARAMETER(BatchParam);
 DMLC_REGISTER_PARAMETER(ImageAugmentParam);
-}  // namespace mxnet
 }  // namespace io
+}  // namespace mxnet
diff --git a/src/io/iter_batch.h b/src/io/iter_batch.h
index 7fe8f4440513..b45dfd3328e1 100644
--- a/src/io/iter_batch.h
+++ b/src/io/iter_batch.h
@@ -1,7 +1,8 @@
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file iter_batch_proc-inl.hpp
  * \brief definition of preprocessing iterators that takes an iterator and do some preprocessing
- * \author Tianqi Chen
+ * \author Tianqi Chen, Tianjun Xiao
  */
 #ifndef MXNET_IO_ITER_BATCH_H_
 #define MXNET_IO_ITER_BATCH_H_
@@ -10,6 +11,9 @@
 #include <mxnet/base.h>
 #include <dmlc/logging.h>
 #include <mshadow/tensor.h>
+#include <utility>
+#include <string>
+#include <vector>
 
 namespace mxnet {
 namespace io {
@@ -18,7 +22,6 @@ struct BatchParam : public dmlc::Parameter<BatchParam> {
   /*! \brief label width */
   index_t batch_size;
   /*! \brief input shape */
-  // TODO: haven't modify all shape_
   TShape input_shape;
   /*! \brief label width */
   index_t label_width;
@@ -32,13 +35,14 @@ struct BatchParam : public dmlc::Parameter<BatchParam> {
   DMLC_DECLARE_PARAMETER(BatchParam) {
     DMLC_DECLARE_FIELD(batch_size)
         .describe("Batch size.");
-    float input_shape_default = {3, 224, 224};
-    DMLC_DECLARE_FIELD(input_shape).set_default(TShape(input_shape_default, input_shape_default + 3))
+    index_t input_shape_default[] = {3, 224, 224};
+    DMLC_DECLARE_FIELD(input_shape)
+        .set_default(TShape(input_shape_default, input_shape_default + 3))
         .set_expect_ndim(3).enforce_nonzero()
-        .describe("Input shape of the neural net");   
+        .describe("Input shape of the neural net");
     DMLC_DECLARE_FIELD(label_width).set_default(1)
         .describe("Label width.");
-    DMLC_DECLARE_FIELD(round_batch).set_default(false)
+    DMLC_DECLARE_FIELD(round_batch).set_default(true)
         .describe("Use round robin to handle overflow batch.");
     DMLC_DECLARE_FIELD(test_skipread).set_default(false)
         .describe("Skip read for testing.");
@@ -46,25 +50,25 @@ struct BatchParam : public dmlc::Parameter<BatchParam> {
         .describe("Whether to print batch information.");
   }
 };
-    
+
 /*! \brief create a batch iterator from single instance iterator */
 class BatchAdaptIter: public IIterator<DataBatch> {
-public:
-  BatchAdaptIter(IIterator<DataInst> *base): base_(base) {
-    num_overflow_ = 0;
-  }
+ public:
+  explicit BatchAdaptIter(IIterator<DataInst> *base): base_(base), num_overflow_(0) {}
   virtual ~BatchAdaptIter(void) {
     delete base_;
     FreeSpaceDense();
   }
   virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
     std::vector<std::pair<std::string, std::string> > kwargs_left;
-    // init batch param, it could have similar param with 
+    // init batch param, it could have similar param with
     kwargs_left = param_.InitAllowUnknown(kwargs);
     // init base iterator
     base_->Init(kwargs);
-    mshadow::Shape<4> tshape = param_.input_shape;
-    tshape[0] = param_.batch_size;
+    data_shape_[1] = param_.input_shape[0];
+    data_shape_[2] = param_.input_shape[1];
+    data_shape_[3] = param_.input_shape[2];
+    data_shape_[0] = param_.batch_size;
     AllocSpaceDense(false);
   }
   virtual void BeforeFirst(void) {
@@ -80,8 +84,10 @@ class BatchAdaptIter: public IIterator<DataBatch> {
     out_.num_batch_padd = 0;
 
     // skip read if in head version
-    if (param_.test_skipread != 0 && head_ == 0) return true;
-    else this->head_ = 0;
+    if (param_.test_skipread != 0 && head_ == 0)
+        return true;
+    else
+        this->head_ = 0;
 
     // if overflow from previous round, directly return false, until before first is called
     if (num_overflow_ != 0) return false;
@@ -124,7 +130,8 @@ class BatchAdaptIter: public IIterator<DataBatch> {
     CHECK(head_ == 0) << "must call Next to get value";
     return out_;
   }
-private:
+
+ private:
   /*! \brief batch parameters */
   BatchParam param_;
   /*! \brief base iterator */
@@ -139,9 +146,11 @@ class BatchAdaptIter: public IIterator<DataBatch> {
   mshadow::Tensor<mshadow::cpu, 2> label;
   /*! \brief content of dense data, if this DataBatch is dense */
   mshadow::Tensor<mshadow::cpu, 4> data;
+  /*! \brief data shape */
+  mshadow::Shape<4> data_shape_;
   // Functions that allocate and free tensor space
-  inline void AllocSpaceDense(bool pad = false) { 
-    data = mshadow::NewTensor<mshadow::cpu>(param_.input_shape, 0.0f, pad);
+  inline void AllocSpaceDense(bool pad = false) {
+    data = mshadow::NewTensor<mshadow::cpu>(data_shape_, 0.0f, pad);
     mshadow::Shape<2> lshape = mshadow::Shape2(param_.batch_size, param_.label_width);
     label = mshadow::NewTensor<mshadow::cpu>(lshape, 0.0f, pad);
     out_.inst_index = new unsigned[param_.batch_size];
@@ -157,7 +166,7 @@ class BatchAdaptIter: public IIterator<DataBatch> {
       label.dptr_ = NULL;
     }
   }
-}; // class BatchAdaptIter
+};  // class BatchAdaptIter
 }  // namespace io
-}  // namespace cxxnet
+}  // namespace mxnet
 #endif  // MXNET_IO_ITER_BATCH_H_
diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc
index 0c44a2346e4a..701c28deb4c9 100644
--- a/src/io/iter_image_recordio.cc
+++ b/src/io/iter_image_recordio.cc
@@ -1,9 +1,9 @@
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file iter_image_recordio-inl.hpp
  * \brief recordio data
 iterator
  */
-#include <cstdlib>
 #include <dmlc/base.h>
 #include <dmlc/io.h>
 #include <dmlc/omp.h>
@@ -13,11 +13,11 @@ iterator
 #include <dmlc/threadediter.h>
 #include <unordered_map>
 #include <vector>
+#include <cstdlib>
 #include "./inst_vector.h"
 #include "./image_recordio.h"
 #include "./image_augmenter.h"
 #include "./iter_batch.h"
-#include "../utils/decoder.h"
 namespace mxnet {
 namespace io {
 /*! \brief data structure to hold labels for images */
@@ -31,7 +31,7 @@ class ImageLabelMap {
   explicit ImageLabelMap(const char *path_imglist,
                          mshadow::index_t label_width,
                          bool silent) {
-    label_width = label_width;
+    this->label_width = label_width;
     image_index_.clear();
     label_.clear();
     idx2label_.clear();
@@ -116,10 +116,11 @@ struct ImageRecParserParam : public dmlc::Parameter<ImageRecParserParam> {
         .describe("Dist worker number.");
     DMLC_DECLARE_FIELD(dist_worker_rank).set_default(0)
         .describe("Dist worker rank.");
-    float input_shape_default = {3, 224, 224};
-    DMLC_DECLARE_FIELD(input_shape).set_default(TShape(input_shape_default, input_shape_default + 3))
+    index_t input_shape_default[] = {3, 224, 224};
+    DMLC_DECLARE_FIELD(input_shape)
+        .set_default(TShape(input_shape_default, input_shape_default + 3))
         .set_expect_ndim(3).enforce_nonzero()
-        .describe("Input shape of the neural net");  
+        .describe("Input shape of the neural net");
   }
 };
 
@@ -143,7 +144,7 @@ class ImageRecordIOParser {
   }
   // initialize the parser
   inline void Init(const std::vector<std::pair<std::string, std::string> >& kwargs);
-  
+
   // set record to the head
   inline void BeforeFirst(void) {
     return source_->BeforeFirst();
@@ -151,11 +152,12 @@ class ImageRecordIOParser {
   // parse next set of records, return an array of
   // instance vector to the user
   inline bool ParseNext(std::vector<InstVector> *out);
+
  private:
   // magic nyumber to see prng
   static const int kRandMagic = 111;
   /*! \brief parameters */
-  ImageRecParserParam param_; 
+  ImageRecParserParam param_;
   /*! \brief augmenters */
   std::vector<ImageAugmenter*> augmenters_;
   /*! \brief random samplers */
@@ -164,9 +166,12 @@ class ImageRecordIOParser {
   dmlc::InputSplit *source_;
   /*! \brief label information, if any */
   ImageLabelMap *label_map_;
+  /*! \brief temp space */
+  mshadow::TensorContainer<cpu, 3> img_;
 };
 
-inline void ImageRecordIOParser::Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+inline void ImageRecordIOParser::Init(
+        const std::vector<std::pair<std::string, std::string> >& kwargs) {
   // initialize parameter
   std::vector<std::pair<std::string, std::string> > kwargs_left;
   // init image rec param
@@ -185,12 +190,11 @@ inline void ImageRecordIOParser::Init(const std::vector<std::pair<std::string, s
   // setup decoders
   for (int i = 0; i < threadget; ++i) {
     augmenters_.push_back(new ImageAugmenter());
-    augmenters_[i]->Init(kwargs_left);
+    augmenters_[i]->Init(kwargs);
     prnds_.push_back(new common::RANDOM_ENGINE((i + 1) * kRandMagic));
   }
-  
+
   // handling for hadoop
-  // TODO, hack
   const char *ps_rank = getenv("PS_RANK");
   if (ps_rank != NULL) {
     param_.dist_worker_rank = atoi(ps_rank);
@@ -205,7 +209,6 @@ inline void ImageRecordIOParser::Init(const std::vector<std::pair<std::string, s
   CHECK(param_.path_imgrec.length() != 0)
     << "ImageRecordIOIterator: must specify image_rec";
 #if MSHADOW_DIST_PS
-    // TODO move to a better place
     param_.dist_num_worker = ::ps::RankSize();
     param_.dist_worker_rank = ::ps::MyRank();
     LOG(INFO) << "rank " << param_.dist_worker_rank
@@ -235,14 +238,16 @@ ParseNext(std::vector<InstVector> *out_vec) {
     InstVector &out = (*out_vec)[tid];
     out.Clear();
     while (reader.NextRecord(&blob)) {
+      rec.Load(blob.dptr, blob.size);
       out.Push(static_cast<unsigned>(rec.image_index()),
-               mshadow::Shape3(param_.input_shape[0], param_.input_shape[0], param_.input_shape[0]),
+               mshadow::Shape3(param_.input_shape[0], param_.input_shape[1], param_.input_shape[2]),
                mshadow::Shape1(param_.label_width));
       DataInst inst = out.Back();
       // turn datainst into tensor
-      mshadow::Tensor<mshadow::cpu, 3> data = inst.data[0].get<mshadow::cpu, 3, float>(); 
-      mshadow::Tensor<mshadow::cpu, 1> label = inst.data[1].get<mshadow::cpu, 1, float>(); 
-      augmenters_[tid]->Process(rec.content, rec.content_size, &data, prnd);
+      mshadow::Tensor<mshadow::cpu, 3> data = inst.data[0].get<mshadow::cpu, 3, float>();
+      mshadow::Tensor<mshadow::cpu, 1> label = inst.data[1].get<mshadow::cpu, 1, float>();
+      augmenters_[tid]->Process(rec.content, rec.content_size, &img_, prnds_[tid]);
+      mshadow::Copy(data, img_);
       if (label_map_ != NULL) {
         mshadow::Copy(label, label_map_->Find(rec.image_index()));
       } else {
@@ -259,12 +264,20 @@ struct ImageRecordParam: public dmlc::Parameter<ImageRecordParam> {
   bool shuffle;
   /*! \brief random seed */
   int seed;
+  /*! \brief mean file string*/
+  std::string mean_img;
+  /*! \brief whether to remain silent */
+  bool silent;
   // declare parameters
   DMLC_DECLARE_PARAMETER(ImageRecordParam) {
     DMLC_DECLARE_FIELD(shuffle).set_default(true)
         .describe("Whether to shuffle data.");
     DMLC_DECLARE_FIELD(seed).set_default(0)
         .describe("Random Seed.");
+    DMLC_DECLARE_FIELD(mean_img).set_default("./data/mean.bin")
+        .describe("Path to image mean file.");
+    DMLC_DECLARE_FIELD(silent).set_default(false)
+        .describe("Whether to output information.");
   }
 };
 
@@ -283,8 +296,8 @@ class ImageRecordIter : public IIterator<DataInst> {
     std::vector<std::pair<std::string, std::string> > kwargs_left;
     // init image rec param
     kwargs_left = param_.InitAllowUnknown(kwargs);
-    // use the left kwarg to init parser
-    parser_.Init(kwargs_left);
+    // use the kwarg to init parser
+    parser_.Init(kwargs);
     // init thread iter
     iter_.set_max_capacity(4);
     iter_.Init([this](std::vector<InstVector> **dptr) {
@@ -294,6 +307,15 @@ class ImageRecordIter : public IIterator<DataInst> {
         return parser_.ParseNext(*dptr);
       },
       [this]() { parser_.BeforeFirst(); });
+    // Check Meanfile
+    if (param_.mean_img.length() != 0) {
+      dmlc::Stream *fi = dmlc::Stream::Create(param_.mean_img.c_str(), "r", true);
+      if (fi == NULL) {
+        this->CreateMeanImg();
+      } else {
+        delete fi;
+      }
+    }
     inst_ptr_ = 0;
   }
   virtual void BeforeFirst(void) {
@@ -320,7 +342,8 @@ class ImageRecordIter : public IIterator<DataInst> {
         }
         // shuffle instance order if needed
         if (shuffle_ != 0) {
-            std::shuffle(inst_order_.begin(), inst_order_.end(), common::RANDOM_ENGINE(kRandMagic + param_.seed));
+            std::shuffle(inst_order_.begin(), inst_order_.end(), \
+                    common::RANDOM_ENGINE(kRandMagic + param_.seed));
         }
         inst_ptr_ = 0;
       }
@@ -332,6 +355,40 @@ class ImageRecordIter : public IIterator<DataInst> {
   }
 
  private:
+  inline void CreateMeanImg(void) {
+    if (param_.silent == 0) {
+      printf("cannot find %s: create mean image, this will take some time...\n",
+              param_.mean_img.c_str());
+    }
+    time_t start = time(NULL);
+    uint64_t elapsed = 0;
+    size_t imcnt = 1;
+    this->BeforeFirst();
+    CHECK(this->Next()) << "input iterator failed.";
+    // Get the first data
+    mshadow::Tensor<mshadow::cpu, 3> img_tensor = out_.data[0].get<mshadow::cpu, 3, float>();
+    meanimg_.Resize(img_tensor.shape_);
+    mshadow::Copy(meanimg_, img_tensor);
+    while (this->Next()) {
+      mshadow::Tensor<mshadow::cpu, 3> img_tensor = out_.data[0].get<mshadow::cpu, 3, float>();
+      meanimg_ += img_tensor; imcnt += 1;
+      elapsed = (uint64_t)(time(NULL) - start);
+      if (imcnt % 1000 == 0 && param_.silent == 0) {
+        printf("\r                                                               \r");
+        printf("[%8lu] images processed, %ld sec elapsed", imcnt, elapsed);
+        fflush(stdout);
+      }
+    }
+    meanimg_ *= (1.0f / imcnt);
+
+    dmlc::Stream *fo = dmlc::Stream::Create(param_.mean_img.c_str(), "w");
+    meanimg_.SaveBinary(*fo);
+    delete fo;
+    if (param_.silent == 0) {
+      printf("save mean image to %s..\n", param_.mean_img.c_str());
+    }
+  }
+
   // random magic
   static const int kRandMagic = 111;
   // output instance
@@ -350,6 +407,8 @@ class ImageRecordIter : public IIterator<DataInst> {
   dmlc::ThreadedIter<std::vector<InstVector> > iter_;
   // parameters
   ImageRecordParam param_;
+  // mean image
+  mshadow::TensorContainer<cpu, 3> meanimg_;
 };
 DMLC_REGISTER_PARAMETER(ImageRecParserParam);
 DMLC_REGISTER_PARAMETER(ImageRecordParam);
diff --git a/src/utils/decoder.h b/src/utils/decoder.h
deleted file mode 100644
index 52db01edee23..000000000000
--- a/src/utils/decoder.h
+++ /dev/null
@@ -1,128 +0,0 @@
-#ifndef MXNET_UTILS_DECODER_H_
-#define MXNET_UTILS_DECODER_H_
-
-#include <vector>
-#if MXNET_USE_OPENCV_DECODER == 0
-  #include <jpeglib.h>
-  #include <setjmp.h>
-  #include <jerror.h>
-#endif
-#include <dmlc/logging.h>
-#include <mshadow/tensor.h>
-#if MXNET_USE_OPENCV
-  #include <opencv2/opencv.hpp>
-#endif
-
-namespace mxnet {
-namespace utils {
-
-#if MXNET_USE_OPENCV_DECODER == 0
-struct JpegDecoder {
-public:
-  JpegDecoder(void) {
-    cinfo.err = jpeg_std_error(&jerr.base);
-    jerr.base.error_exit = jerror_exit;
-    jerr.base.output_message = joutput_message;
-    jpeg_create_decompress(&cinfo);
-  }
-  // destructor
-  ~JpegDecoder(void) {
-    jpeg_destroy_decompress(&cinfo);
-  }
-
-  inline void Decode(unsigned char *ptr, size_t sz,
-                     mshadow::TensorContainer<cpu, 3, unsigned char> *p_data) {
-    if(setjmp(jerr.jmp)) {
-      jpeg_destroy_decompress(&cinfo);
-      dmlc::Error("Libjpeg fail to decode");
-    }
-    this->jpeg_mem_src(&cinfo, ptr, sz);
-    CHECK(jpeg_read_header(&cinfo, TRUE) == JPEG_HEADER_OK) << "libjpeg: failed to decode";
-    CHECK(jpeg_start_decompress(&cinfo) == true) << "libjpeg: failed to decode";
-    p_data->Resize(mshadow::Shape3(cinfo.output_height, cinfo.output_width, cinfo.output_components));
-    JSAMPROW jptr = &((*p_data)[0][0][0]);
-    while (cinfo.output_scanline < cinfo.output_height) {
-      CHECK(jpeg_read_scanlines(&cinfo, &jptr, 1) == true) << "libjpeg: failed to decode";
-      jptr += cinfo.output_width * cinfo.output_components;
-    }
-    CHECK(jpeg_finish_decompress(&cinfo) == true) << "libjpeg: failed to decode");
-  }
-private:
-  struct jerror_mgr {
-    jpeg_error_mgr base;
-    jmp_buf jmp;
-  };
-
-  METHODDEF(void) jerror_exit(j_common_ptr jinfo) {
-    jerror_mgr* err = (jerror_mgr*)jinfo->err;
-    longjmp(err->jmp, 1);
-  }
-
-  METHODDEF(void) joutput_message(j_common_ptr) {}
-
-  static boolean mem_fill_input_buffer_ (j_decompress_ptr cinfo) {
-    dmlc::Error("JpegDecoder: bad jpeg image");
-    return true;
-  }
-
-  static void mem_skip_input_data_ (j_decompress_ptr cinfo, long num_bytes_) {
-    jpeg_source_mgr *src = cinfo->src;
-    size_t num_bytes = static_cast<size_t>(num_bytes_);
-    if (num_bytes > 0) {
-      src->next_input_byte += num_bytes;
-      CHECK(src->bytes_in_buffer >= num_bytes) << "fail to decode";
-      src->bytes_in_buffer -= num_bytes;
-    } else {
-      dmlc::Error("JpegDecoder: bad jpeg image");
-
-    }
-  }
-
-  static void mem_term_source_ (j_decompress_ptr cinfo) {}
-  static void mem_init_source_ (j_decompress_ptr cinfo) {}
-  static boolean jpeg_resync_to_restart_(j_decompress_ptr cinfo, int desired) {
-    dmlc::Error("JpegDecoder: bad jpeg image");
-    return true;
-  }
-  void jpeg_mem_src (j_decompress_ptr cinfo, void* buffer, long nbytes) {
-    src.init_source = mem_init_source_;
-    src.fill_input_buffer = mem_fill_input_buffer_;
-    src.skip_input_data = mem_skip_input_data_;
-    src.resync_to_restart = jpeg_resync_to_restart_;
-    src.term_source = mem_term_source_;
-    src.bytes_in_buffer = nbytes;
-    src.next_input_byte = static_cast<JOCTET*>(buffer);
-    cinfo->src = &src;
-  }
-
-private:
-  jpeg_decompress_struct cinfo;
-  jpeg_source_mgr src;
-  jerror_mgr jerr;
-};
-#endif
-
-#if MXNET_USE_OPENCV
-struct OpenCVDecoder {
-  void Decode(unsigned char *ptr, size_t sz, mshadow::TensorContainer<cpu, 3, unsigned char> *p_data) {
-    cv::Mat buf(1, sz, CV_8U, ptr);
-    cv::Mat res = cv::imdecode(buf, 1);
-    CHECK(res.data != NULL) << "decoding fail";
-    p_data->Resize(mshadow::Shape3(res.rows, res.cols, 3));
-    for (int y = 0; y < res.rows; ++y) {
-      for (int x = 0; x < res.cols; ++x) {
-        cv::Vec3b bgr = res.at<cv::Vec3b>(y, x);
-        // store in RGB order
-        (*p_data)[y][x][2] = bgr[0];
-        (*p_data)[y][x][1] = bgr[1];
-        (*p_data)[y][x][0] = bgr[2];
-      }
-    }
-    res.release();
-  }
-};
-#endif
-} // namespace utils
-} // namespace mxnet
-
-#endif // DECODER_H
diff --git a/tests/python/test_io.py b/tests/python/test_io.py
index 991a4813033e..8706b062e5d7 100644
--- a/tests/python/test_io.py
+++ b/tests/python/test_io.py
@@ -5,28 +5,29 @@
 import pickle as pickle
 import sys
 import get_data
+from PIL import Image
 
-# prepare data
-get_data.GetMNIST_ubyte()
 
-batch_size = 100
-train_dataiter = mx.io.MNISTIter(
-        image="data/train-images-idx3-ubyte",
-        label="data/train-labels-idx1-ubyte",
-        batch_size=batch_size, shuffle=1, flat=1, silent=0, seed=10)
-val_dataiter = mx.io.MNISTIter(
-        image="data/t10k-images-idx3-ubyte",
-        label="data/t10k-labels-idx1-ubyte",
-        batch_size=batch_size, shuffle=0, flat=1, silent=0)
+def test_MNISTIter():
+    # prepare data
+    get_data.GetMNIST_ubyte()
 
-def test_MNISTIter_loop():
+    batch_size = 100
+    train_dataiter = mx.io.MNISTIter(
+            image="data/train-images-idx3-ubyte",
+            label="data/train-labels-idx1-ubyte",
+            batch_size=batch_size, shuffle=1, flat=1, silent=0, seed=10)
+    val_dataiter = mx.io.MNISTIter(
+            image="data/t10k-images-idx3-ubyte",
+            label="data/t10k-labels-idx1-ubyte",
+            batch_size=batch_size, shuffle=0, flat=1, silent=0)
+    # test_loop
     nbatch = 60000 / batch_size
     batch_count = 0
     for data, label in train_dataiter:
         batch_count += 1
     assert(nbatch == batch_count)
-
-def test_MNISTIter_reset():
+    # test_reset
     train_dataiter.reset()
     train_dataiter.iter_next()
     label_0 = train_dataiter.getlabel().numpy.flatten()
@@ -40,17 +41,40 @@ def test_MNISTIter_reset():
     assert(sum(label_0 - label_1) == 0)
 
 def test_ImageRecIter():
-    dataiter = mx.io.ImageRecordIter(path_imgrec="data/val_cxxnet.rec",
-            image_mean="data/val_cxxnet_mean.bin",
+    dataiter = mx.io.ImageRecordIter(
+            #path_imglist="data/smallset/val_cxxnet5000.txt", 
+            path_imgrec="data/val_cxxnet.rec",
+            #mean_img="data/smallset/image_net_mean.bin",
             rand_crop=True,
-            rand_mirror=True,
-            input_shape="3,224,224",
-            batch_size=128)
-
-
-
-
+            mirror=True,
+            input_shape=(3,227,227),
+            batch_size=100,
+            nthread=1,
+            seed=10)
+    # Test label read 
+    labelcount = [0 for i in range(1000)] 
+    batchcount = 0
+    for data, label in dataiter:
+        npdata = data.numpy
+        print npdata[0,:,:,:]
+        imgdata = np.zeros([227, 227, 3], dtype=np.uint8)
+        imgdata[:,:,0] = npdata[10,2,:,:]
+        imgdata[:,:,1] = npdata[10,1,:,:]
+        imgdata[:,:,2] = npdata[10,0,:,:]
+        img = Image.fromarray(imgdata)
+        imgpath = "data/smallset/test_3.jpg"
+        img.save(imgpath, format='JPEG')
 
+        exit(0)
+        print batchcount
+        sys.stdout.flush()
+        batchcount += 1
+        nplabel = label.numpy
+        for i in range(nplabel.shape[0]):
+            labelcount[int(nplabel[i])] += 1
+    # Test image
 
 
+if __name__ == '__main__':
+    test_ImageRecIter()
 

From ccfba8944eaeb2be18e51d861547c45a3e53f4c9 Mon Sep 17 00:00:00 2001
From: sneakerkg <xiaotj1990327@gmail.com>
Date: Thu, 3 Sep 2015 08:42:14 +0800
Subject: [PATCH 07/15] merging the code, not compiled

---
 include/mxnet/io.h   | 16 ++++++++
 src/common/utils.h   |  5 +++
 src/io/inst_vector.h | 92 ++++++++++++++++++++------------------------
 src/io/io.cc         |  9 ++++-
 src/io/iter_mnist.cc |  4 +-
 5 files changed, 72 insertions(+), 54 deletions(-)

diff --git a/include/mxnet/io.h b/include/mxnet/io.h
index 47a59eec54fe..5a8267befc1c 100644
--- a/include/mxnet/io.h
+++ b/include/mxnet/io.h
@@ -109,5 +109,21 @@ struct DataIteratorReg
   }                                                                     \
   DMLC_REGISTRY_REGISTER(::mxnet::DataIteratorReg, DataIteratorReg, name) \
   .set_body(__create__ ## DataIteratorType ## __)
+/*!
+ * \brief Macro to register chained Iterators
+ *
+ * \code
+ * // example of registering a imagerec iterator
+ * MXNET_REGISTER_IO_CHAINED_ITERATOR(ImageRec, ImageRecordIter, BatchIter)
+ * .describe("batched image record data iterator");
+ *
+ * \endcode
+ */
+#define MXNET_REGISTER_IO_CHAINED_ITER(name, ChainedDataIterType, HoldingDataIterType)          \
+  static ::mxnet::IIterator<DataBatch>* __create__ ## ChainedDataIteratorType ## __() { \
+    return new HoldingDataIteratorType(new ChainedDataIterType);                                    \
+  }                                                                     \
+  DMLC_REGISTRY_REGISTER(::mxnet::DataIteratorReg, DataIteratorReg, name) \
+  .set_body(__create__ ## ChainedDataIteratorType ## __)
 }  // namespace mxnet
 #endif  // MXNET_IO_H_
diff --git a/src/common/utils.h b/src/common/utils.h
index cf1fd2f1bb36..f7a2dcce0470 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -22,6 +22,11 @@ namespace common {
  */
 typedef std::mt19937 RANDOM_ENGINE;
 
+// Get a double float, prnd is the pointer to a Random Engine
+#define NextDouble(prnd) std::generate_canonical<float, 10>(*prnd) 
+
+#define NextUInt32(range, prnd) static_cast<uint32_t>(\
+        floor(std::generate_canonical<float, 10>(*prnd) * range))
 /*!
  * \brief Helper functions.
  */
diff --git a/src/io/inst_vector.h b/src/io/inst_vector.h
index 1ae734631680..9490ceab94c1 100644
--- a/src/io/inst_vector.h
+++ b/src/io/inst_vector.h
@@ -1,17 +1,19 @@
 /*!
- *  Copyright (c) 2015 by Contributors
- * \inst_vector.h
+ * \file inst_vector.h
  * \brief holder of a sequence of DataInst in CPU
  *        that are not necessarily of same shape
  */
-#ifndef MXNET_IO_INST_VECTOR_H_
-#define MXNET_IO_INST_VECTOR_H_
+
+#ifndef MXNET_INST_VECTOR_H_
+#define MXNET_INST_VECTOR_H_
+
+#include "./data.h"
+#include <vector>
 #include <dmlc/base.h>
 #include <mshadow/tensor.h>
-#include <vector>
-#include <string>
-#include "./data.h"
+
 namespace mxnet {
+namespace io {
 /*!
  * \brief tensor vector that can store sequence of tensor
  *  in a memory compact way, tensors do not have to be of same shape
@@ -28,7 +30,7 @@ class TensorVector {
     CHECK(i + 1 < offset_.size());
     CHECK(shape_[i].Size() == offset_[i + 1] - offset_[i]);
     return mshadow::Tensor<cpu, dim, DType>
-        (reinterpret_cast<DType*>(BeginPtr(content_)) + offset_[i], shape_[i]);
+        ((DType*)BeginPtr(content_) + offset_[i], shape_[i]);
   }
   inline mshadow::Tensor<cpu, dim, DType> Back() const {
     return (*this)[Size() - 1];
@@ -49,7 +51,6 @@ class TensorVector {
     content_.clear();
     shape_.clear();
   }
-
  private:
   // offset of the data content
   std::vector<size_t> offset_;
@@ -59,59 +60,48 @@ class TensorVector {
   std::vector<mshadow::Shape<dim> > shape_;
 };
 
-/*!
- * \brief tblob vector that can store sequence of tblob
- *  in a memory compact way, tblobs do not have to be of same shape
- */
-template<typename DType>
-class TBlobVector {
- public:
-  TBlobVector(void) {
-    this->Clear();
-  }
-  // get i-th tblob
-  inline TBlob operator[](size_t i) const;
-  // get the last tblob
-  inline TBlob Back();
-  // return the size of the vector
-  inline size_t Size(void) const;
-  // push a tensor of certain shape
-  // return the reference of the pushed tensor
-  inline void Push(TShape shape_);
-  inline void Clear(void);
- private:
-  // offset of the data content
-  std::vector<size_t> offset_;
-  // data content
-  std::vector<DType> content_;
-  // shape of data
-  std::vector<TShape > shape_;
-};
-
 /*!
  * \brief instance vector that can holds
  * non-uniform shape data instance in a shape efficient way
  */
 class InstVector {
- public:
+ public:  
   inline size_t Size(void) const {
     return index_.size();
   }
   // instance
-  inline DataInst operator[](size_t i) const;
+  inline DataInst operator[](size_t i) const {
+    DataInst inst;
+    inst.index = index_[i];
+    inst.data = data_[i];
+    inst.label = label_[i];
+    return inst;
+  }
   // get back of instance vector
-  inline DataInst Back() const;
-  // clear the container
-  inline void Clear(void);
-  // push the newly coming instance
-  inline void Push(unsigned index, TBlob data_);
-
- private:
+  inline DataInst Back() const {
+    return (*this)[Size() - 1];
+  }
+  inline void Clear(void) {
+    index_.clear();
+    data_.Clear();
+    label_.Clear();
+  }
+  inline void Push(unsigned index,
+                   mshadow::Shape<3> dshape,
+                   mshadow::Shape<1> lshape) {
+    index_.push_back(index);
+    data_.Push(dshape);
+    label_.Push(lshape);
+  }
+  
+ private:  
   /*! \brief index of the data */
   std::vector<unsigned> index_;
+  // label
+  TensorVector<3, real_t> data_;
   // data
-  std::vector<TensorVector<real_t> > data_;
-  // extra data
-  std::vector<std::string> extra_data_;
+  TensorVector<1, real_t> label_;
 };
-#endif  // MXNET_IO_INST_VECTOR_H_
+}  // namespace io
+}  // namespace mxnet
+#endif  // MXNET_TENSOR_VECTOR_H_
diff --git a/src/io/io.cc b/src/io/io.cc
index bd5b78dda643..9095f4089c92 100644
--- a/src/io/io.cc
+++ b/src/io/io.cc
@@ -4,7 +4,14 @@
 
 #include <mxnet/io.h>
 #include <dmlc/registry.h>
+#include <image_augmenter.h>
+#include <>
+#include <iter_batch.h>
 
+// Registers
 namespace dmlc {
 DMLC_REGISTRY_ENABLE(::mxnet::DataIteratorReg);
-}  // namespace dmlc
+// Register parameters in header files
+DMLC_REGISTER_PARAMETER(BatchParam);
+DMLC_REGISTER_PARAMETER(ImageAugmenterParam);
+}  // namespace dmlc
\ No newline at end of file
diff --git a/src/io/iter_mnist.cc b/src/io/iter_mnist.cc
index 93195061b278..77ac3a479f75 100644
--- a/src/io/iter_mnist.cc
+++ b/src/io/iter_mnist.cc
@@ -31,7 +31,7 @@ struct MNISTParam : public dmlc::Parameter<MNISTParam> {
   bool flat;
   /*! \brief random seed */
   int seed;
-  // declare parameters in header file
+  // declare parameters
   DMLC_DECLARE_PARAMETER(MNISTParam) {
     DMLC_DECLARE_FIELD(image).set_default("./train-images-idx3-ubyte")
         .describe("Mnist image path.");
@@ -155,7 +155,7 @@ class MNISTIter: public IIterator<DataBatch> {
     delete stdlabel;
   }
   inline void Shuffle(void) {
-    std::shuffle(inst_.begin(), inst_.end(), common::RANDOM_ENGINE(kRandMagic+param_.seed));
+    std::shuffle(inst_.begin(), inst_.end(), common::RANDOM_ENGINE(kRandMagic + param_.seed));
     std::vector<float> tmplabel(labels_.size());
     mshadow::TensorContainer<cpu, 3> tmpimg(img_.shape_);
     for (size_t i = 0; i < inst_.size(); ++i) {

From a54f6c9f4bc44fdc3910ccfd97ee4352df077913 Mon Sep 17 00:00:00 2001
From: sneakerkg <xiaotj1990327@gmail.com>
Date: Thu, 3 Sep 2015 08:42:50 +0800
Subject: [PATCH 08/15] add image rec and associate files in

---
 src/io/image_augmenter.h      | 262 ++++++++++++++++++++++++
 src/io/image_recordio.h       |  75 +++++++
 src/io/iter_batch.h           | 162 +++++++++++++++
 src/io/iter_image_recordio.cc | 369 ++++++++++++++++++++++++++++++++++
 src/utils/decoder.h           | 128 ++++++++++++
 src/utils/io.h                | 175 ++++++++++++++++
 src/utils/thread_buffer.h     | 205 +++++++++++++++++++
 7 files changed, 1376 insertions(+)
 create mode 100644 src/io/image_augmenter.h
 create mode 100644 src/io/image_recordio.h
 create mode 100644 src/io/iter_batch.h
 create mode 100644 src/io/iter_image_recordio.cc
 create mode 100644 src/utils/decoder.h
 create mode 100644 src/utils/io.h
 create mode 100644 src/utils/thread_buffer.h

diff --git a/src/io/image_augmenter.h b/src/io/image_augmenter.h
new file mode 100644
index 000000000000..d33464c4a889
--- /dev/null
+++ b/src/io/image_augmenter.h
@@ -0,0 +1,262 @@
+/*!
+ * \file image_augmenter_opencv.hpp
+ * \brief threaded version of page iterator
+ * \author Naiyan Wang, Tianqi Chen
+ */
+#ifndef MXNET_IO_IMAGE_AUGMENTER_H_
+#define MXNET_IO_IMAGE_AUGMENTER_H_
+
+#include <opencv2/opencv.hpp>
+#include "../common/utils.h"
+
+namespace mxnet {
+namespace io {
+/*! \brief image augmentation parameters*/
+struct ImageAugmentParam : public dmlc::Parameter<ImageAugmentParam> {
+  /*! \brief whether we do random cropping */
+  bool rand_crop_;
+  /*! \brief whether we do nonrandom croping */
+  int crop_y_start_;
+  /*! \brief whether we do nonrandom croping */
+  int crop_x_start_;
+  /*! \brief Indicate the max ratation angle for augmentation, we will random rotate */
+  /*! \brief [-max_rotate_angle, max_rotate_angle] */
+  int max_rotate_angle_;
+  /*! \brief max aspect ratio */
+  float max_aspect_ratio_;
+  /*! \brief random shear the image [-max_shear_ratio, max_shear_ratio] */
+  float max_shear_ratio_;
+  /*! \brief max crop size */
+  int max_crop_size_;
+  /*! \brief min crop size */
+  int min_crop_size_;
+  /*! \brief max scale ratio */
+  float max_random_scale_;
+  /*! \brief min scale_ratio */
+  float min_random_scale_;
+  /*! \brief min image size */
+  float min_img_size_;
+  /*! \brief max image size */
+  float max_img_size_;
+  /*! \brief whether to mirror the image */
+  bool mirror_;
+  /*! \brief rotate angle */
+  int rotate_;
+  /*! \brief filled color while padding */
+  int fill_value_;
+  // declare parameters
+  // TODO: didn't understand the range for some params
+  DMLC_DECLARE_PARAMETER(ImageAugmentParam) {
+    DMLC_DECLARE_FIELD(rand_crop_).set_default(true)
+        .describe("Whether we de random cropping");
+    DMLC_DECLARE_FIELD(crop_y_start_).set_default(-1)
+        .describe("Where to nonrandom crop on y");
+    DMLC_DECLARE_FIELD(crop_x_start_).set_default(-1)
+        .describe("Where to nonrandom crop on x");
+    DMLC_DECLARE_FIELD(max_rotate_angle_).set_default(0.0f)
+        .describe("Rotate can be [-max_rotate_angle, max_rotate_angle]");
+    DMLC_DECLARE_FIELD(max_aspect_ratio_).set_default(0.0f)
+        .describe("Max aspect ratio");
+    DMLC_DECLARE_FIELD(max_shear_ratio_).set_default(0.0f)
+        .describe("Shear rotate can be made between [-max_shear_ratio_, max_shear_ratio_]");
+    DMLC_DECLARE_FIELD(max_crop_size_).set_default(-1)
+        .describe("Maximum crop size");
+    DMLC_DECLARE_FIELD(min_crop_size_).set_default(-1)
+        .describe("Minimum crop size");
+    DMLC_DECLARE_FIELD(max_random_scale_).set_default(1.0f)
+        .describe("Maxmum scale ratio");
+    DMLC_DECLARE_FIELD(min_random_scale_).set_default(1.0f)
+        .describe("Minimum scale ratio");       
+    DMLC_DECLARE_FIELD(max_img_size_).set_default(1e10f)
+        .describe("Maxmum image size");
+    DMLC_DECLARE_FIELD(min_img_size_).set_default(0.0f)
+        .describe("Minimum image size");
+    DMLC_DECLARE_FIELD(mirror_).set_default(false)
+        .describe("Whether to mirror the image");
+    DMLC_DECLARE_FIELD(rotate_).set_default(-1.0f)
+        .describe("Rotate angle");
+    DMLC_DECLARE_FIELD(fill_value_).set_default(255)
+        .describe("Filled value while padding");
+};
+
+/*! \brief helper class to do image augmentation */
+class ImageAugmenter {
+ public:
+  // contructor
+  ImageAugmenter(void)
+      : tmpres(false),
+        rotateM(2, 3, CV_32F) {
+  }
+  virtual ~ImageAugmenter() {
+  }
+  // TODO: Hack the shape and rotate list, didn't use param
+  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    std::vector<std::pair<std::string, std::string> > kwargs_left;
+    kwargs_left = param_.InitAllowUnknown(kwargs);
+    for (size_t i = 0; i < kwargs_left.size(); i++) {
+        if (!strcmp(kwargs_left[i].first.c_str(), "input_shape")) {
+          CHECK(sscanf(kwargs_left[i].second.c_str(), "%u,%u,%u", &shape_[0], &shape_[1], &shape_[2]) == 3)
+                       << "input_shape must be three consecutive integers without space example: 1,1,200 ";
+        }
+        if (!strcmp(kwargs_left[i].first.c_str(), "rotate_list")) {
+          char* val = kwargs_left[i].second.c_str();
+          const char *end = val + strlen(val);
+          char buf[128];
+          while (val < end) {
+            sscanf(val, "%[^,]", buf);
+            val += strlen(buf) + 1;
+            rotate_list_.push_back(atoi(buf));
+          }
+        }
+    }
+  }
+  /*!
+   * \brief augment src image, store result into dst
+   *   this function is not thread safe, and will only be called by one thread
+   *   however, it will tries to re-use memory space as much as possible
+   * \param src the source image
+   * \param source of random number
+   * \param dst the pointer to the place where we want to store the result
+   */
+  virtual cv::Mat Process(const cv::Mat &src,
+                          common::RANDOM_ENGINE *prnd) {
+    // shear
+    float s = common::NextDouble(prnd) * param_.max_shear_ratio_ * 2 - param_.max_shear_ratio_;
+    // rotate
+    int angle = common::NextUInt32(param_.max_rotate_angle_ * 2, prnd) - param_.max_rotate_angle_;
+    if (param_.rotate_ > 0) angle = param_.rotate_;
+    if (rotate_list_.size() > 0) {
+      angle = rotate_list_[NextUInt32(rotate_list_.size() - 1, prnd)];
+    }
+    float a = cos(angle / 180.0 * M_PI);
+    float b = sin(angle / 180.0 * M_PI);
+    // scale
+    float scale = NextDouble(prnd) * (param_.max_random_scale_ - param_.min_random_scale_) + param_.min_random_scale_;
+    // aspect ratio
+    float ratio = NextDouble(prnd) * param_.max_aspect_ratio_ * 2 - param_.max_aspect_ratio_ + 1;
+    float hs = 2 * scale / (1 + ratio);
+    float ws = ratio * hs;
+    // new width and height
+    float new_width = std::max(param_.min_img_size_, std::min(param_.max_img_size_, scale * src.cols));
+    float new_height = std::max(param_.min_img_size_, std::min(param_.max_img_size_, scale * src.rows));
+    //printf("%f %f %f %f %f %f %f %f %f\n", s, a, b, scale, ratio, hs, ws, new_width, new_height);
+    cv::Mat M(2, 3, CV_32F);
+    M.at<float>(0, 0) = hs * a - s * b * ws;
+    M.at<float>(1, 0) = -b * ws;
+    M.at<float>(0, 1) = hs * b + s * a * ws;
+    M.at<float>(1, 1) = a * ws;
+    float ori_center_width = M.at<float>(0, 0) * src.cols + M.at<float>(0, 1) * src.rows;
+    float ori_center_height = M.at<float>(1, 0) * src.cols + M.at<float>(1, 1) * src.rows;
+    M.at<float>(0, 2) = (new_width - ori_center_width) / 2;
+    M.at<float>(1, 2) = (new_height - ori_center_height) / 2;
+    cv::warpAffine(src, temp, M, cv::Size(new_width, new_height),
+                     cv::INTER_LINEAR,
+                     cv::BORDER_CONSTANT,
+                     cv::Scalar(param_.fill_value_, param_.fill_value_, param_.fill_value_));
+    cv::Mat res = temp;
+    if (param_.max_crop_size_ != -1 || param_.min_crop_size_ != -1){
+      CHECK(res.cols >= param_.max_crop_size_ && res.rows >= param_.max_crop_size_&& param_.max_crop_size_ >= param_.min_crop_size_)
+          << "input image size smaller than max_crop_size";
+      mshadow::index_t rand_crop_size = NextUInt32(param_.max_crop_size_- param_.min_crop_size_+1, prnd)+ param_.min_crop_size_;
+      mshadow::index_t y = res.rows - rand_crop_size;
+      mshadow::index_t x = res.cols - rand_crop_size;
+      if (rand_crop_ != 0) {
+        y = NextUInt32(y + 1, prnd);
+        x = NextUInt32(x + 1, prnd);
+      }
+      else {
+        y /= 2; x /= 2;
+      }
+      cv::Rect roi(x, y, rand_crop_size, rand_crop_size);
+      cv::resize(res(roi), res, cv::Size(shape_[1], shape_[2]));
+    }
+    else{
+      utils::Check(static_cast<mshadow::index_t>(res.cols) >= shape_[1] && static_cast<mshadow::index_t>(res.rows) >= shape_[2],
+        "input image size smaller than input shape");
+      mshadow::index_t y = res.rows - shape_[2];
+      mshadow::index_t x = res.cols - shape_[1];
+      if (param_.rand_crop_ != 0) {
+        y = NextUInt32(y + 1, prnd);
+        x = NextUInt32(x + 1, prnd);
+      }
+      else {
+        y /= 2; x /= 2;
+      }
+      cv::Rect roi(x, y, shape_[1], shape_[2]);
+      res = res(roi);
+    }
+    return res;
+  }
+  /*!
+   * \brief augment src image, store result into dst
+   *   this function is not thread safe, and will only be called by one thread
+   *   however, it will tries to re-use memory space as much as possible
+   * \param src the source image
+   * \param source of random number
+   * \param dst the pointer to the place where we want to store the result
+   */
+  virtual mshadow::Tensor<cpu, 3> Process(mshadow::Tensor<cpu, 3> data,
+                                          common::RANDOM_ENGINE *prnd) {
+    if (!NeedProcess()) return data;
+    cv::Mat res(data.size(1), data.size(2), CV_8UC3);
+    for (index_t i = 0; i < data.size(1); ++i) {
+      for (index_t j = 0; j < data.size(2); ++j) {
+        res.at<cv::Vec3b>(i, j)[0] = data[2][i][j];
+        res.at<cv::Vec3b>(i, j)[1] = data[1][i][j];
+        res.at<cv::Vec3b>(i, j)[2] = data[0][i][j];
+      }
+    }
+    res = this->Process(res, prnd);
+    tmpres.Resize(mshadow::Shape3(3, res.rows, res.cols));
+    for (index_t i = 0; i < tmpres.size(1); ++i) {
+      for (index_t j = 0; j < tmpres.size(2); ++j) {
+        cv::Vec3b bgr = res.at<cv::Vec3b>(i, j);
+        tmpres[0][i][j] = bgr[2];
+        tmpres[1][i][j] = bgr[1];
+        tmpres[2][i][j] = bgr[0];
+      }
+    }
+    return tmpres;
+  }
+
+  virtual void Process(unsigned char *dptr, size_t sz,
+                       mshadow::TensorContainer<cpu, 3> *p_data,
+                       common::RANDOM_ENGINE *prnd) {
+    cv::Mat buf(1, sz, CV_8U, dptr);
+    cv::Mat res = cv::imdecode(buf, 1);
+    res = this->Process(res, prnd);
+    p_data->Resize(mshadow::Shape3(3, res.rows, res.cols));
+    for (index_t i = 0; i < p_data->size(1); ++i) {
+      for (index_t j = 0; j < p_data->size(2); ++j) {
+        cv::Vec3b bgr = res.at<cv::Vec3b>(i, j);
+        (*p_data)[0][i][j] = bgr[2];
+        (*p_data)[1][i][j] = bgr[1];
+        (*p_data)[2][i][j] = bgr[0];
+      }
+    }
+    res.release();
+  }
+
+ private:
+  // whether skip processing
+  inline bool NeedProcess(void) const {
+    if (max_rotate_angle_ > 0 || max_shear_ratio_ > 0.0f
+        || rotate_ > 0 || rotate_list_.size() > 0) return true;
+    if (min_crop_size_ > 0 && max_crop_size_ > 0) return true;
+    return false;
+  }
+  // temp input space
+  mshadow::TensorContainer<cpu, 3> tmpres;
+  // temporal space
+  cv::Mat temp0, temp, temp2;
+  // rotation param
+  cv::Mat rotateM;
+  // parameters
+  /*! \brief input shape */
+  mshadow::Shape<4> shape_;
+  /*! \brief list of possible rotate angle */
+  std::vector<int> rotate_list_;
+};
+}  // namespace io
+}  // namespace cxxnet
+#endif
diff --git a/src/io/image_recordio.h b/src/io/image_recordio.h
new file mode 100644
index 000000000000..4aea8aabcb47
--- /dev/null
+++ b/src/io/image_recordio.h
@@ -0,0 +1,75 @@
+/*!
+ * \file image_recordio.h
+ * \brief image recordio struct
+ */
+#ifndef MXNET_IO_IMAGE_RECORDIO_H_
+#define MXNET_IO_IMAGE_RECORDIO_H_
+
+#include <dmlc/base.h>
+#include <dmlc/io.h>
+
+namespace mxnet {
+namespace io {
+/*! \brief image recordio struct */
+struct ImageRecordIO {
+  /*! \brief header in image recordio */
+  struct Header {
+    /*!
+     * \brief flag of the header,
+     *  used for future extension purposes
+     */
+    uint32_t flag;
+    /*!
+     * \brief label field that returns label of images
+     *  when image list was not presented,
+     * 
+     * NOTE: user do not need to repack recordio just to
+     * change label field, just supply a list file that
+     * maps image id to new labels
+     */
+    float label;
+    /*!
+     * \brief unique image index
+     *  image_id[1] is always set to 0,
+     *  reserved for future purposes for 128bit id
+     *  image_id[0] is used to store image id
+     */
+    uint64_t image_id[2];
+  };
+  /*! \brief header of image recordio */
+  Header header;
+  /*! \brief pointer to data content */
+  uint8_t *content;
+  /*! \brief size of the content */
+  size_t content_size;
+  /*! \brief constructor */
+  ImageRecordIO(void)
+      : content(NULL), content_size(0) {
+    memset(&header, 0, sizeof(header));
+  }
+  /*! \brief get image id from record */
+  inline uint64_t image_index(void) const {
+    return header.image_id[0];
+  }
+  /*!
+   * \brief load header from a record content 
+   * \param buf the head of record
+   * \param size the size of the entire record   
+   */
+  inline void Load(void *buf, size_t size) {
+    CHECK(size >= sizeof(header));
+    std::memcpy(&header, buf, sizeof(header));
+    content = reinterpret_cast<uint8_t*>(buf) + sizeof(header);
+    content_size = size - sizeof(header);
+  }
+  /*!
+   * \brief save the record header
+   */
+  inline void SaveHeader(std::string *blob) const {
+    blob->resize(sizeof(header));
+    std::memcpy(dmlc::BeginPtr(*blob), &header, sizeof(header));    
+  }  
+}; 
+}  // namespace io
+}  // namespace mxnet
+#endif  // MXNET_IO_IMAGE_RECORDIO_H_
diff --git a/src/io/iter_batch.h b/src/io/iter_batch.h
new file mode 100644
index 000000000000..a0e4ab7e7ba5
--- /dev/null
+++ b/src/io/iter_batch.h
@@ -0,0 +1,162 @@
+/*!
+ * \file iter_batch.h
+ * \brief definition of an iterator adapter that packs single instances into batches
+ * \author Tianqi Chen
+ */
+#ifndef MXNET_IO_ITER_BATCH_H_
+#define MXNET_IO_ITER_BATCH_H_
+
+#include <mxnet/io.h>
+#include <mxnet/base.h>
+#include <dmlc/logging.h>
+#include <mshadow/tensor.h>
+
+namespace mxnet {
+namespace io {
+// Parameters of the batch adapter (BatchAdaptIter)
+struct BatchParam : public dmlc::Parameter<BatchParam> {
+  /*! \brief number of instances in one batch */
+  index_t batch_size_;
+  /*! \brief label width */
+  index_t label_width_;
+  /*! \brief use round robin to handle overflow batch */
+  bool round_batch_;
+  /*! \brief skip read */
+  bool test_skipread_;
+  /*! \brief silent */
+  bool silent_;
+  // declare parameters
+  DMLC_DECLARE_PARAMETER(BatchParam) {
+    DMLC_DECLARE_FIELD(batch_size_).set_default(1)
+        .describe("Batch size.");
+    DMLC_DECLARE_FIELD(label_width_).set_default(1)
+        .describe("Label width.");
+    DMLC_DECLARE_FIELD(round_batch_).set_default(false)
+        .describe("Use round robin to handle overflow batch.");
+    DMLC_DECLARE_FIELD(test_skipread_).set_default(false)
+        .describe("Skip read for testing.");
+    DMLC_DECLARE_FIELD(silent_).set_default(false)
+        .describe("Whether to print batch information.");
+  }
+};
+    
+/*! \brief create a batch iterator from single instance iterator; takes ownership of it */
+class BatchAdaptIter: public IIterator<DataBatch> {
+public:
+  BatchAdaptIter(IIterator<DataInst> *base): base_(base), head_(1), num_overflow_(0) {
+    label.dptr_ = NULL;  // lets FreeSpaceDense() tell that nothing is allocated yet
+  }
+  virtual ~BatchAdaptIter(void) {
+    delete base_;
+    FreeSpaceDense();
+  }
+  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    std::vector<std::pair<std::string, std::string> > kwargs_left;
+    // init batch param; keys it does not recognise come back in kwargs_left
+    kwargs_left = param_.InitAllowUnknown(kwargs);
+    for (size_t i = 0; i < kwargs_left.size(); i++) {
+      if (kwargs_left[i].first == "input_shape") {
+        CHECK(sscanf(kwargs_left[i].second.c_str(), "%u,%u,%u", &shape_[1], &shape_[2], &shape_[3]) == 3)
+          << "input_shape must be three consecutive integers without space example: 1,1,200 ";
+      }
+    }
+    // init base iterator
+    base_->Init(kwargs);
+    // the leading dimension of the batch tensor is the batch size
+    shape_[0] = param_.batch_size_;
+    AllocSpaceDense(false);
+  }
+  virtual void BeforeFirst(void) {
+    if (param_.round_batch_ == 0 || num_overflow_ == 0) {
+      // otherwise, Next() already rewound the base iterator while padding the last batch
+      base_->BeforeFirst();
+    } else {
+      num_overflow_ = 0;
+    }
+    head_ = 1;
+  }
+  virtual bool Next(void) {
+    out_.num_batch_padd = 0;
+    // skip read if in head version
+    if (param_.test_skipread_ != 0 && head_ == 0) return true;
+    else this->head_ = 0;
+
+    // if overflow from previous round, directly return false, until before first is called
+    if (num_overflow_ != 0) return false;
+    index_t top = 0;
+
+    while (base_->Next()) {
+      const DataInst& d = base_->Value();
+      mshadow::Copy(label[top], d.data[1].get<mshadow::cpu, 1, float>());
+      out_.inst_index[top] = d.index;
+      mshadow::Copy(data[top], d.data[0].get<mshadow::cpu, 3, float>());
+      if (++ top >= param_.batch_size_) {
+        out_.data[0] = TBlob(data);
+        out_.data[1] = TBlob(label);
+        return true;
+      }
+    }
+    if (top != 0) {
+      if (param_.round_batch_ != 0) {
+        num_overflow_ = 0;
+        base_->BeforeFirst();
+        for (; top < param_.batch_size_; ++top, ++num_overflow_) {
+          CHECK(base_->Next()) << "number of input must be bigger than batch size";
+          const DataInst& d = base_->Value();
+          mshadow::Copy(label[top], d.data[1].get<mshadow::cpu, 1, float>());
+          out_.inst_index[top] = d.index;
+          mshadow::Copy(data[top], d.data[0].get<mshadow::cpu, 3, float>());
+        }
+        out_.num_batch_padd = num_overflow_;
+      } else {
+        out_.num_batch_padd = param_.batch_size_ - top;
+      }
+      out_.data[0] = TBlob(data);
+      out_.data[1] = TBlob(label);
+      return true;
+    }
+    return false;
+  }
+  virtual const DataBatch &Value(void) const {
+    CHECK(head_ == 0) << "must call Next to get value";
+    return out_;
+  }
+private:
+  /*! \brief base iterator, deleted in the destructor */
+  IIterator<DataInst> *base_;
+  /*! \brief batch parameters */
+  BatchParam param_;
+  /*! \brief shape of one batch: shape_[0] is the batch size, the rest comes from input_shape */
+  mshadow::Shape<4> shape_;
+  /*! \brief output data */
+  DataBatch out_;
+  /*! \brief 1 until Next() has been called after construction or BeforeFirst() */
+  int head_;
+  /*! \brief number of overflow instances that were read in round_batch mode */
+  int num_overflow_;
+  /*! \brief label storage of the batch, batch_size x label_width */
+  mshadow::Tensor<mshadow::cpu, 2> label;
+  /*! \brief dense data storage of the batch */
+  mshadow::Tensor<mshadow::cpu, 4> data;
+  /*! \brief allocate the data/label tensors and the instance index array */
+  inline void AllocSpaceDense(bool pad = false) { 
+    data = mshadow::NewTensor<mshadow::cpu>(shape_, 0.0f, pad);
+    mshadow::Shape<2> lshape = mshadow::Shape2(param_.batch_size_, param_.label_width_);
+    label = mshadow::NewTensor<mshadow::cpu>(lshape, 0.0f, pad);
+    out_.inst_index = new unsigned[param_.batch_size_];
+    out_.batch_size = param_.batch_size_;
+    out_.data.resize(2);
+  }
+  /*! \brief auxiliary function to free space, if needed, dense only */
+  inline void FreeSpaceDense(void) {
+    if (label.dptr_ != NULL) {
+      delete [] out_.inst_index;
+      mshadow::FreeSpace(&label);
+      mshadow::FreeSpace(&data);
+      label.dptr_ = NULL;
+    }
+  }
+}; // class BatchAdaptIter
+}  // namespace io
+}  // namespace mxnet
+#endif  // MXNET_IO_ITER_BATCH_H_
\ No newline at end of file
diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc
new file mode 100644
index 000000000000..2ab1aa8958cb
--- /dev/null
+++ b/src/io/iter_image_recordio.cc
@@ -0,0 +1,369 @@
+/*!
+ * \file iter_image_recordio.cc
+ * \brief iterator over image data
+ *  packed in recordio format
+ */
+#include <cstdlib>
+#include <dmlc/base.h>
+#include <dmlc/io.h>
+#include <dmlc/omp.h>
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <dmlc/recordio.h>
+#include <dmlc/threadediter.h>
+#include <unordered_map>
+#include <vector>
+#include "./inst_vector.h"
+#include "./image_recordio.h"
+#include "./image_augmenter.h"
+#include "../utils/decoder.h"
+namespace mxnet {
+namespace io {
+/*! \brief data structure to hold labels for images */
+class ImageLabelMap {
+ public:
+  /*!
+   * \brief initialize the label list into memory
+   * \param path_imglist path to the image list
+   * \param label_width predefined label_width
+   * \param silent when false, log how many records were loaded
+   */
+  explicit ImageLabelMap(const char *path_imglist,
+                         mshadow::index_t label_width,
+                         bool silent) {
+    label_width_ = label_width;
+    dmlc::InputSplit *fi = dmlc::InputSplit::Create
+        (path_imglist, 0, 1, "text");
+    dmlc::InputSplit::Blob rec;
+    while (fi->NextRecord(&rec)) {
+      // quick manual parsing; every scan tests p != end first so that *p is
+      // never read past the record, and casts to unsigned char for isspace
+      char *p = reinterpret_cast<char*>(rec.dptr);
+      char *end = p + rec.size;
+      // skip space
+      while (p != end && isspace(static_cast<unsigned char>(*p))) ++p;
+      image_index_.push_back(static_cast<size_t>(atol(p)));
+      for (size_t i = 0; i < label_width_; ++i) {
+        // skip till space
+        while (p != end && !isspace(static_cast<unsigned char>(*p))) ++p;
+        // skip space
+        while (p != end && isspace(static_cast<unsigned char>(*p))) ++p;
+        CHECK(p != end) << "Bad ImageList format";
+        label_.push_back(static_cast<real_t>(atof(p)));
+      }
+    }
+    delete fi;
+    // be careful not to resize label_ afterwards: idx2label_ stores pointers into it
+    idx2label_.reserve(image_index_.size());
+    for (size_t i = 0; i < image_index_.size(); ++i) {
+      idx2label_[image_index_[i]] = dmlc::BeginPtr(label_) + i * label_width_;
+    }
+    if (!silent) {
+      LOG(INFO) << "Loaded ImageList from " << path_imglist << ' '
+                << image_index_.size() << " Image records";
+    }
+  }
+  /*! \brief find the label of an image id; fails a CHECK when the id is unknown */
+  inline mshadow::Tensor<cpu, 1> Find(size_t imid) const {
+    std::unordered_map<size_t, real_t*>::const_iterator it
+        = idx2label_.find(imid);
+    CHECK(it != idx2label_.end()) << "fail to find imagelabel for id " << imid;
+    return mshadow::Tensor<cpu, 1>(it->second, mshadow::Shape1(label_width_));
+  }
+
+ private:
+  // number of label values per image
+  mshadow::index_t label_width_;
+  // image index of each record
+  std::vector<size_t> image_index_;
+  // real label content, label_width_ values per record
+  std::vector<real_t> label_;
+  // map image index to the start of its labels inside label_
+  std::unordered_map<size_t, real_t*> idx2label_;
+};
+
+// Define image record parser parameters
+struct ImageRecParserParam : public dmlc::Parameter<ImageRecParserParam> {
+  /*! \brief path to image list */
+  std::string path_imglist_;
+  /*! \brief path to image recordio */
+  std::string path_imgrec_;
+  /*! \brief number of threads */
+  int nthread_;
+  /*! \brief whether to remain silent */
+  bool silent_;
+  /*! \brief number of distributed workers, and the rank of this worker among them */
+  int dist_num_worker_, dist_worker_rank_;
+  /*! \brief label-width */
+  int label_width_;
+  // declare parameters
+  DMLC_DECLARE_PARAMETER(ImageRecParserParam) {
+    DMLC_DECLARE_FIELD(path_imglist_).set_default("")
+        .describe("Path to image list.");
+    DMLC_DECLARE_FIELD(path_imgrec_).set_default("./data/imgrec.rec")
+        .describe("Path to image record file.");
+    DMLC_DECLARE_FIELD(nthread_).set_lower_bound(1).set_default(4)
+        .describe("Number of thread to do parsing.");
+    DMLC_DECLARE_FIELD(label_width_).set_lower_bound(1).set_default(1)
+        .describe("How many labels for an image.");
+    DMLC_DECLARE_FIELD(silent_).set_default(false)
+        .describe("Whether to output parser information.");
+    DMLC_DECLARE_FIELD(dist_num_worker_).set_lower_bound(1).set_default(1)
+        .describe("Dist worker number.");
+    DMLC_DECLARE_FIELD(dist_worker_rank_).set_default(0)
+        .describe("Dist worker rank.");
+  }
+};
+
+// parser that reads chunks from an image recordio source and decodes them
+class ImageRecordIOParser {
+ public:
+  ImageRecordIOParser(void)
+      : source_(NULL),
+        label_map_(NULL) {
+  }
+  ~ImageRecordIOParser(void) {
+    // both pointers may still be NULL if Init was never called
+    delete label_map_;
+    delete source_;
+    for (size_t i = 0; i < augmenters_.size(); ++i) {
+      delete augmenters_[i];
+    }
+    for (size_t i = 0; i < prnds_.size(); ++i) {
+      delete prnds_[i];
+    }
+  }
+  // initialize the parser from key/value arguments
+  inline void Init(const std::vector<std::pair<std::string, std::string> >& kwargs);
+  
+  // rewind the data source to its head
+  inline void BeforeFirst(void) {
+    return source_->BeforeFirst();
+  }
+  // parse the next set of records into an array of instance vectors,
+  // returns false once the source has no more chunks
+  inline bool ParseNext(std::vector<InstVector> *out);
+ private:
+  // magic number used to seed the per-thread PRNGs
+  static const int kRandMagic = 111;
+  /*! \brief parameters */
+  ImageRecParserParam param_; 
+  /*! \brief augmenters, one per parsing thread, owned by this object */
+  std::vector<ImageAugmenter*> augmenters_;
+  /*! \brief random samplers, one per parsing thread, owned by this object */
+  std::vector<common::RANDOM_ENGINE*> prnds_;
+  /*! \brief data source */
+  dmlc::InputSplit *source_;
+  /*! \brief label information, if any */
+  ImageLabelMap *label_map_;
+};
+
+inline void ImageRecordIOParser::Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+  // initialize parameter
+  std::vector<std::pair<std::string, std::string> > kwargs_left;
+  // init image rec param; keys it does not recognise come back in kwargs_left
+  kwargs_left = param_.InitAllowUnknown(kwargs);
+  // omp_get_num_procs() needs no parallel region: query it once, serially
+  const int maxthread = std::max(omp_get_num_procs() / 2 - 1, 1);
+  param_.nthread_ = std::min(maxthread, param_.nthread_);
+  // probe how many threads a team of the requested size really gets
+  int threadget = 1;
+  #pragma omp parallel num_threads(param_.nthread_)
+  {
+    #pragma omp master
+    threadget = omp_get_num_threads();
+  }
+  param_.nthread_ = threadget;
+  // setup decoders: one augmenter and one PRNG per parsing thread
+  for (int i = 0; i < threadget; ++i) {
+    augmenters_.push_back(new ImageAugmenter());
+    augmenters_[i]->init(kwargs_left);  // NOTE(review): confirm the method name in image_augmenter.h
+    prnds_.push_back(new common::RANDOM_ENGINE((i + 1) * kRandMagic));
+  }
+  
+  // handling for hadoop
+  // TODO, hack
+  const char *ps_rank = getenv("PS_RANK");
+  if (ps_rank != NULL) {
+    param_.dist_worker_rank_ = atoi(ps_rank);
+  }
+
+  if (param_.path_imglist_.length() != 0) {
+    label_map_ = new ImageLabelMap(param_.path_imglist_.c_str(),
+                                   param_.label_width_, param_.silent_);
+  } else {
+    param_.label_width_ = 1;
+  }
+  CHECK(param_.path_imgrec_.length() != 0)
+    << "ImageRecordIOIterator: must specify image_rec";
+#if MSHADOW_DIST_PS
+    // TODO move to a better place
+    param_.dist_num_worker_ = ::ps::RankSize();
+    param_.dist_worker_rank_ = ::ps::MyRank();
+    LOG(INFO) << "rank " << param_.dist_worker_rank_
+              << " in " << param_.dist_num_worker_;
+#endif
+  source_ = dmlc::InputSplit::Create
+      (param_.path_imgrec_.c_str(), param_.dist_worker_rank_,
+       param_.dist_num_worker_, "recordio");
+  // hint an 8 MB chunk size (8 << 20)
+  source_->HintChunkSize(8 << 20UL);
+}
+
+// decode the next chunk: thread tid fills (*out_vec)[tid]; false when the source is exhausted
+inline bool ImageRecordIOParser::
+ParseNext(std::vector<InstVector> *out_vec) {
+  CHECK(source_ != NULL);
+  dmlc::InputSplit::Blob chunk;
+  if (!source_->NextChunk(&chunk)) return false;
+  out_vec->resize(param_.nthread_);
+  #pragma omp parallel num_threads(param_.nthread_)
+  {
+    CHECK(omp_get_num_threads() == param_.nthread_);
+    int tid = omp_get_thread_num();
+    dmlc::RecordIOChunkReader reader(chunk, tid, param_.nthread_);
+    ImageRecordIO rec;
+    dmlc::InputSplit::Blob blob;
+    // image data
+    InstVector &out = (*out_vec)[tid];
+    out.Clear();
+    while (reader.NextRecord(&blob)) {
+      rec.Load(blob.dptr, blob.size);
+      cv::Mat buf(1, rec.content_size, CV_8U, rec.content);
+      // decode to 3 channels; imdecode returns an empty Mat when it fails
+      cv::Mat res = cv::imdecode(buf, 1);
+      CHECK(res.data != NULL) << "decoding fail";
+      res = augmenters_[tid]->Process(res, prnds_[tid]);
+      out.Push(static_cast<unsigned>(rec.image_index()),
+               mshadow::Shape3(3, res.rows, res.cols),
+               mshadow::Shape1(param_.label_width_));
+      DataInst inst = out.Back();
+      for (int i = 0; i < res.rows; ++i) {
+        for (int j = 0; j < res.cols; ++j) {
+          cv::Vec3b bgr = res.at<cv::Vec3b>(i, j);
+          inst.data[0][i][j] = bgr[2];
+          inst.data[1][i][j] = bgr[1];
+          inst.data[2][i][j] = bgr[0];
+        }
+      }
+      if (label_map_ != NULL) {
+        mshadow::Copy(inst.label, label_map_->Find(rec.image_index()));
+      } else {
+        inst.label[0] = rec.header.label;
+      }
+    }
+  }
+  return true;
+}
+
+// Parameters of the image record iterator (ImageRecordIter)
+struct ImageRecordParam: public dmlc::Parameter<ImageRecordParam> {
+  /*! \brief whether to shuffle the instance order */
+  bool shuffle;
+  /*! \brief random seed, added to kRandMagic to seed the shuffling engine */
+  int seed;
+  // declare parameters
+  DMLC_DECLARE_PARAMETER(ImageRecordParam) {
+    DMLC_DECLARE_FIELD(shuffle).set_default(true)
+        .describe("Whether to shuffle data.");
+    DMLC_DECLARE_FIELD(seed).set_default(0)
+        .describe("Random Seed.");
+  }
+};
+
+// iterator on image recordio: pulls chunks of decoded instances from a
+// background ThreadedIter and hands them out one at a time, optionally
+// shuffled within each chunk
+class ImageRecordIter : public IIterator<DataInst> {
+ public:
+  ImageRecordIter()
+      : inst_ptr_(0), data_(NULL) {
+  }
+  virtual ~ImageRecordIter(void) {
+    iter_.Destroy();
+    // data can be NULL
+    delete data_;
+  }
+  virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    std::vector<std::pair<std::string, std::string> > kwargs_left;
+    // init the iterator's own param (shuffle, seed); the rest is returned
+    kwargs_left = param_.InitAllowUnknown(kwargs);
+    // use the left kwarg to init parser
+    parser_.Init(kwargs_left);
+    // init thread iter
+    iter_.set_max_capacity(4);
+    iter_.Init([this](std::vector<InstVector> **dptr) {
+        if (*dptr == NULL) {
+          *dptr = new std::vector<InstVector>();
+        }
+        return parser_.ParseNext(*dptr);
+      },
+      [this]() { parser_.BeforeFirst(); });
+    inst_ptr_ = 0;
+  }
+  virtual void BeforeFirst(void) {
+    iter_.BeforeFirst();
+    inst_order_.clear();
+    inst_ptr_ = 0;
+  }
+  virtual bool Next(void) {
+    while (true) {
+      if (inst_ptr_ < inst_order_.size()) {
+        std::pair<unsigned, unsigned> p = inst_order_[inst_ptr_];
+        out_ = (*data_)[p.first][p.second];
+        ++inst_ptr_;
+        return true;
+      } else {
+        if (data_ != NULL) iter_.Recycle(&data_);
+        if (!iter_.Next(&data_)) return false;
+        inst_order_.clear();
+        for (unsigned i = 0; i < data_->size(); ++i) {
+          const InstVector &tmp = (*data_)[i];
+          for (unsigned j = 0; j < tmp.Size(); ++j) {
+            inst_order_.push_back(std::make_pair(i, j));
+          }
+        }
+        // shuffle instance order if needed
+        if (param_.shuffle) {
+            std::shuffle(inst_order_.begin(), inst_order_.end(), common::RANDOM_ENGINE(kRandMagic + param_.seed));
+        }
+        inst_ptr_ = 0;
+      }
+    }
+    return false;
+  }
+  virtual const DataInst &Value(void) const {
+    return out_;
+  }
+
+ private:
+  // random magic
+  static const int kRandMagic = 111;
+  // output instance
+  DataInst out_;
+  // position of the next instance inside inst_order_
+  size_t inst_ptr_;
+  // internal instance order: (vector index, index inside that vector)
+  std::vector<std::pair<unsigned, unsigned> > inst_order_;
+  // chunk currently being handed out, obtained from iter_
+  std::vector<InstVector> *data_;
+  // internal parser
+  ImageRecordIOParser parser_;
+  // backend thread
+  dmlc::ThreadedIter<std::vector<InstVector> > iter_;
+  // parameters
+  ImageRecordParam param_;
+};
+DMLC_REGISTER_PARAMETER(ImageRecParserParam);
+DMLC_REGISTER_PARAMETER(ImageRecordParam);
+// register ImageRecordIter, wrapped in a BatchAdaptIter, as a DataBatch iterator
+MXNET_REGISTER_IO_CHAINED_ITER(ImageRecordIter, ImageRecordIter, BatchAdaptIter)
+    .describe("Create iterator for dataset packed in recordio.")
+    .add_arguments(ImageRecordParam::__FIELDS__())
+    .add_arguments(ImageRecParserParam::__FIELDS__())
+    .add_arguments(BatchParam::__FIELDS__())
+    .add_arguments(ImageAugmenterParam::__FIELDS__());
+}  // namespace io
+}  // namespace mxnet
+#endif  // ITER_IMAGE_RECORDIO_INL_HPP_
diff --git a/src/utils/decoder.h b/src/utils/decoder.h
new file mode 100644
index 000000000000..17203392cc60
--- /dev/null
+++ b/src/utils/decoder.h
@@ -0,0 +1,128 @@
+#ifndef MXNET_UTILS_DECODER_H_
+#define MXNET_UTILS_DECODER_H_
+
+#include <vector>
+#if MXNET_USE_OPENCV_DECODER == 0
+  #include <jpeglib.h>
+  #include <setjmp.h>
+  #include <jerror.h>
+#endif
+#include <dmlc/logging.h>
+#include <mshadow/tensor.h>
+#if MXNET_USE_OPENCV
+  #include <opencv2/opencv.hpp>
+#endif
+
+namespace cxxnet {
+namespace utils {
+
+#if MXNET_USE_OPENCV_DECODER == 0
+struct JpegDecoder {
+public:
+  JpegDecoder(void) {
+    cinfo.err = jpeg_std_error(&jerr.base);
+    jerr.base.error_exit = jerror_exit;
+    jerr.base.output_message = joutput_message;
+    jpeg_create_decompress(&cinfo);
+  }
+  // destructor: releases the libjpeg decompression object
+  ~JpegDecoder(void) {
+    jpeg_destroy_decompress(&cinfo);
+  }
+  /*! \brief decode sz bytes of JPEG at ptr into *p_data, laid out height x width x components */
+  inline void Decode(unsigned char *ptr, size_t sz,
+                     mshadow::TensorContainer<mshadow::cpu, 3, unsigned char> *p_data) {
+    if(setjmp(jerr.jmp)) {
+      // reset (rather than destroy) so the object stays usable if LOG(FATAL) throws
+      jpeg_abort_decompress(&cinfo);
+      LOG(FATAL) << "Libjpeg fail to decode";
+    }
+    this->jpeg_mem_src(&cinfo, ptr, sz);
+    CHECK(jpeg_read_header(&cinfo, TRUE) == JPEG_HEADER_OK) << "libjpeg: failed to decode";
+    CHECK(jpeg_start_decompress(&cinfo) == true) << "libjpeg: failed to decode";
+    p_data->Resize(mshadow::Shape3(cinfo.output_height, cinfo.output_width, cinfo.output_components));
+    JSAMPROW jptr = &((*p_data)[0][0][0]);
+    while (cinfo.output_scanline < cinfo.output_height) {
+      CHECK(jpeg_read_scanlines(&cinfo, &jptr, 1) == true) << "libjpeg: failed to decode";
+      jptr += cinfo.output_width * cinfo.output_components;
+    }
+    CHECK(jpeg_finish_decompress(&cinfo) == true) << "libjpeg: failed to decode";
+  }
+private:
+  struct jerror_mgr {
+    jpeg_error_mgr base;
+    jmp_buf jmp;
+  };
+  // error_exit hook: jump back to the setjmp in Decode instead of exiting
+  METHODDEF(void) jerror_exit(j_common_ptr jinfo) {
+    jerror_mgr* err = (jerror_mgr*)jinfo->err;
+    longjmp(err->jmp, 1);
+  }
+  // output_message hook: suppress libjpeg's messages
+  METHODDEF(void) joutput_message(j_common_ptr) {}
+  // the whole image is already in memory, so a refill request means truncated input
+  static boolean mem_fill_input_buffer_ (j_decompress_ptr cinfo) {
+    LOG(FATAL) << "JpegDecoder: bad jpeg image";
+    return true;
+  }
+
+  /*! \brief skip_input_data callback: advance the in-memory source by num_bytes_ */
+  static void mem_skip_input_data_ (j_decompress_ptr cinfo, long num_bytes_) {
+    // libjpeg asks for a zero or negative skip count to be treated as a no-op;
+    // bail out before the cast below turns a negative count into a huge one
+    if (num_bytes_ <= 0) return;
+    jpeg_source_mgr *src = cinfo->src;
+    const size_t num_bytes = static_cast<size_t>(num_bytes_);
+    // validate first so next_input_byte is never moved past the buffer
+    CHECK(src->bytes_in_buffer >= num_bytes) << "fail to decode";
+    src->next_input_byte += num_bytes;
+    src->bytes_in_buffer -= num_bytes;
+  }
+
+  static void mem_term_source_ (j_decompress_ptr cinfo) {}
+  static void mem_init_source_ (j_decompress_ptr cinfo) {}
+  static boolean jpeg_resync_to_restart_(j_decompress_ptr cinfo, int desired) {
+    LOG(FATAL) << "JpegDecoder: bad jpeg image";
+    return true;
+  }
+  void jpeg_mem_src (j_decompress_ptr cinfo, void* buffer, long nbytes) {
+    src.init_source = mem_init_source_;
+    src.fill_input_buffer = mem_fill_input_buffer_;
+    src.skip_input_data = mem_skip_input_data_;
+    src.resync_to_restart = jpeg_resync_to_restart_;
+    src.term_source = mem_term_source_;
+    src.bytes_in_buffer = nbytes;
+    src.next_input_byte = static_cast<JOCTET*>(buffer);
+    cinfo->src = &src;
+  }
+
+private:
+  jpeg_decompress_struct cinfo;
+  jpeg_source_mgr src;
+  jerror_mgr jerr;
+};
+#endif
+
+#if MXNET_USE_OPENCV
+struct OpenCVDecoder {
+  /*! \brief decode sz encoded bytes at ptr into *p_data, laid out rows x cols x 3 in RGB order */
+  void Decode(unsigned char *ptr, size_t sz, mshadow::TensorContainer<mshadow::cpu, 3, unsigned char> *p_data) {
+    cv::Mat buf(1, sz, CV_8U, ptr);
+    cv::Mat res = cv::imdecode(buf, 1);
+    CHECK(res.data != NULL) << "decoding fail";
+    p_data->Resize(mshadow::Shape3(res.rows, res.cols, 3));
+    for (int y = 0; y < res.rows; ++y) {
+      for (int x = 0; x < res.cols; ++x) {
+        cv::Vec3b bgr = res.at<cv::Vec3b>(y, x);
+        // OpenCV yields BGR; store in RGB order
+        (*p_data)[y][x][2] = bgr[0];
+        (*p_data)[y][x][1] = bgr[1];
+        (*p_data)[y][x][0] = bgr[2];
+      }
+    }
+  }
+};
+#endif
+} // namespace utils
+} // namespace mxnet
+
+#endif  // MXNET_UTILS_DECODER_H_
diff --git a/src/utils/io.h b/src/utils/io.h
new file mode 100644
index 000000000000..3781ce98b012
--- /dev/null
+++ b/src/utils/io.h
@@ -0,0 +1,175 @@
+#ifndef CXXNET_UTILS_IO_H_
+#define CXXNET_UTILS_IO_H_
+/*!
+ * \file io.h
+ * \brief definition of abstract stream interface for IO
+ * \author Bing Xu Tianqi Chen
+ */
+#include "./utils.h"
+#include <dmlc/io.h>
+#include <string>
+#include <algorithm>
+#include <cstring>
+
+namespace cxxnet {
+namespace utils {
+typedef dmlc::Stream IStream;
+typedef dmlc::SeekStream ISeekStream;
+
+/*! \brief an in-memory buffer that can be read and written through the stream interface */
+struct MemoryBufferStream : public ISeekStream {
+ public:
+  MemoryBufferStream(std::string *p_buffer)
+      : p_buffer_(p_buffer) {
+    curr_ptr_ = 0;
+  }
+  virtual ~MemoryBufferStream(void) {}
+  virtual size_t Read(void *ptr, size_t size) {
+    CHECK(curr_ptr_ <= p_buffer_->length())
+          << " read can not have position excceed buffer length";
+    size_t nread = std::min(p_buffer_->length() - curr_ptr_, size);  // clamp to what is left
+    if (nread != 0) memcpy(ptr, &(*p_buffer_)[0] + curr_ptr_, nread);
+    curr_ptr_ += nread;
+    return nread;
+  }
+  virtual void Write(const void *ptr, size_t size) {
+    if (size == 0) return;
+    if (curr_ptr_ + size > p_buffer_->length()) {
+      p_buffer_->resize(curr_ptr_+size);  // grow the string so the write fits
+    }
+    memcpy(&(*p_buffer_)[0] + curr_ptr_, ptr, size);
+    curr_ptr_ += size;
+  }
+  virtual void Seek(size_t pos) {
+    curr_ptr_ = static_cast<size_t>(pos);
+  }
+  virtual size_t Tell(void) {
+    return curr_ptr_;
+  }
+
+ private:
+  /*! \brief in-memory buffer; only the pointer is stored, the string is not copied */
+  std::string *p_buffer_;
+  /*! \brief current read/write position, as a byte offset into the buffer */
+  size_t curr_ptr_;
+}; // class MemoryBufferStream
+
+/*! \brief implementation of file i/o stream */
+class StdFile: public ISeekStream {
+ public:
+  /*! \brief constructor that opens fname with the given fopen mode */
+  StdFile(const char *fname, const char *mode) : fp_(NULL), sz_(0) {
+    Open(fname, mode);
+  }
+  StdFile() : fp_(NULL), sz_(0) {}  // the destructor reads fp_, so it must start as NULL
+  virtual ~StdFile(void) {
+    this->Close();
+  }
+  virtual void Open(const char *fname, const char *mode) {
+    fp_ = utils::FopenCheck(fname, mode);
+    fseek(fp_, 0L, SEEK_END);  // measure the file once, then rewind
+    sz_ = ftell(fp_);
+    fseek(fp_, 0L, SEEK_SET);
+  }
+  virtual size_t Read(void *ptr, size_t size) {
+    return fread(ptr, size, 1, fp_);  // NOTE(review): yields 1 or 0 (items), not a byte count
+  }
+  virtual void Write(const void *ptr, size_t size) {
+    fwrite(ptr, size, 1, fp_);
+  }
+  virtual void Seek(size_t pos) {
+    fseek(fp_, pos, SEEK_SET);
+  }
+  virtual size_t Tell(void) {
+    return static_cast<size_t>(ftell(fp_));
+  }
+  inline void Close(void) {
+    if (fp_ != NULL){
+      fclose(fp_); fp_ = NULL;
+    }
+  }
+  inline size_t Size() {
+    return sz_;  // size recorded by Open
+  }
+ private:
+  FILE *fp_;
+  size_t sz_;
+}; // class StdFile
+
+/*! \brief page of ints: an index grows from the front, object bytes are packed from the back */
+class BinaryPage {
+ public:
+  /*! \brief number of ints in one page; 64 MB in total when sizeof(int) is 4 */
+  static const size_t kPageSize = 64 << 18;
+ public:
+  /*! \brief memory data object */
+  struct Obj{
+    /*! \brief pointer to the data*/
+    void  *dptr;
+    /*! \brief size of the data in bytes */
+    size_t sz;
+    Obj(void * dptr, size_t sz) : dptr(dptr), sz(sz){}
+  };
+ public:
+  /*! \brief constructor of page: allocates the storage and zeroes it */
+  BinaryPage(void)  {
+    data_ = new int[kPageSize];
+    utils::Check(data_ != NULL, "fail to allocate page, out of space");
+    this->Clear();
+  };
+  ~BinaryPage() {
+    if (data_) delete [] data_;
+  }
+  /*!
+   * \brief load one page from the input stream
+   * \return true if the stream's Read returned a non-zero value
+   */
+  inline bool Load(utils::IStream &fi) {
+    return fi.Read(&data_[0], sizeof(int)*kPageSize) !=0;
+  }
+  /*! \brief save the whole page into the output stream */
+  inline void Save(utils::IStream &fo) {
+    fo.Write(&data_[0], sizeof(int)*kPageSize);
+  }
+  /*! \return number of objects stored in the page, kept in data_[0] */
+  inline int Size(void){
+    return data_[0];
+  }
+  /*! \brief Push one binary object into page
+   *  \param dat the object (pointer and size) whose bytes are copied into the page
+   *  \return true if the object was stored, false if the page has no room for it
+   */
+  inline bool Push(const Obj &dat) {
+    if(this->FreeBytes() < dat.sz + sizeof(int)) return false;
+    data_[ Size() + 2 ] = data_[ Size() + 1 ] + dat.sz;  // cumulative end offset of the new object
+    memcpy(this->offset(data_[ Size() + 2 ]), dat.dptr, dat.sz);
+    ++ data_[0];
+    return true;
+  }
+  /*! \brief Clear the page by zeroing all of it */
+  inline void Clear(void) {
+    memset(&data_[0], 0, sizeof(int) * kPageSize);
+  }
+  /*!
+   * \brief Get one binary object from page
+   *  \param r index of the object in the page, must be smaller than Size()
+   */
+  inline Obj operator[](int r) {
+    CHECK(r < Size());
+    return Obj(this->offset(data_[ r + 2 ]),  data_[ r + 2 ] - data_[ r + 1 ]);
+  }
+ private:
+  /*! \return bytes left between the end of the index and the stored object bytes */
+  inline size_t FreeBytes(void) {
+    return (kPageSize - (Size() + 2)) * sizeof(int) - data_[ Size() + 1 ];
+  }
+  inline void* offset(int pos) {
+    return (char*)(&data_[0]) + (kPageSize*sizeof(int) - pos);  // pos bytes before the page end
+  }
+ private:
+  //int data_[ kPageSize ];
+  int *data_;
+};  // class BinaryPage
+}  // namespace utils
+}  // namespace cxxnet
+#endif
diff --git a/src/utils/thread_buffer.h b/src/utils/thread_buffer.h
new file mode 100644
index 000000000000..7df1ae17aa56
--- /dev/null
+++ b/src/utils/thread_buffer.h
@@ -0,0 +1,205 @@
+#ifndef CXXNET_UTILS_THREAD_BUFFER_H_
+#define CXXNET_UTILS_THREAD_BUFFER_H_
+/*!
+ * \file thread_buffer.h
+ * \brief  multi-thread buffer, iterator, can be used to create parallel pipeline
+ * \author Tianqi Chen
+ */
+#include <vector>
+#include <cstring>
+#include <cstdlib>
+#include "./utils.h"
+#include "./thread.h"
+namespace cxxnet {
+namespace utils {
+/*!
+ * \brief buffered loading iterator that uses a loader thread
+ * and two buffers; the factory supplies and fills the elements
+ * \tparam Elem element type to be buffered
+ * \tparam ElemFactory factory type to implement in order to use thread buffer
+ */
+template<typename Elem, typename ElemFactory>
+class ThreadBuffer {
+ public:
+  /*!\brief constructor, the default buffer size is 30 elements */
+  ThreadBuffer(void) {
+    this->init_end = false;
+    this->buf_size = 30;
+  }
+  ~ThreadBuffer(void) {
+    if(init_end) this->Destroy();
+  }
+  /*!\brief set parameter ("buffer_size" is handled here), will also pass the parameter to factory */
+  inline void SetParam(const char *name, const char *val) {
+    if (!strcmp( name, "buffer_size")) buf_size = atoi(val);
+    factory.SetParam(name, val);
+  }
+  /*!
+   * \brief initialize the buffered iterator
+   *  creates buf_size elements per buffer and starts the loader thread
+   * \return false if the factory's Init fails, in which case nothing is started
+   */
+  inline bool Init(void) {
+    if (!factory.Init()) return false;
+    bufA.reserve(buf_size);
+    bufB.reserve(buf_size);
+    for (int i = 0; i < buf_size; ++i) {
+      bufA.push_back(factory.Create());
+      bufB.push_back(factory.Create());
+    }
+    this->init_end = true;
+    this->StartLoader();
+    return true;
+  }  
+  /*!\brief place the iterator before first value */
+  inline void BeforeFirst(void) {
+    // wait till last loader end
+    loading_end.Wait();
+    // critical zone
+    current_buf = 1;
+    factory.BeforeFirst();
+    // reset terminate limit
+    endA = endB = buf_size;
+    // wake up loader for first part
+    loading_need.Post();
+    // wait till first part is loaded
+    loading_end.Wait();
+    // set current buf to right value
+    current_buf = 0;
+    // wake loader for next part
+    data_loaded = false;
+    loading_need.Post();
+    // set buffer value
+    buf_index = 0;
+  }  
+  /*! \brief destroy the buffer iterator, will deallocate the buffer */
+  inline void Destroy(void) {
+    // raise the signal, wake the loader so it can see it, then join the thread
+    this->destroy_signal = true;
+    loading_need.Post();
+    loader_thread.Join();
+    loading_need.Destroy();
+    loading_end.Destroy();    
+    for (size_t i = 0; i < bufA.size(); ++i) {
+      factory.FreeSpace(bufA[i]);
+    }
+    for (size_t i = 0; i < bufB.size(); ++i) {
+      factory.FreeSpace(bufB[i]);
+    }
+    bufA.clear(); bufB.clear();
+    factory.Destroy();
+    this->init_end = false;
+  }  
+  /*!
+   * \brief get the next element needed in buffer
+   * \param elem element to store into
+   * \return true if an element was stored, false once the end of data is reached
+   */
+  inline bool Next(Elem &elem) {
+    // end of buffer try to switch
+    if (buf_index == buf_size) {
+      this->SwitchBuffer();
+      buf_index = 0;
+    }
+    if (buf_index >= (current_buf ? endA : endB)) { 
+      return false;
+    }
+    std::vector<Elem> &buf = current_buf ? bufA : bufB;
+    elem = buf[buf_index];
+    ++buf_index;
+    return true;
+  }      
+  /*!
+   * \brief get the factory object
+   */
+  inline ElemFactory &get_factory(void) {
+    return factory;
+  }
+  inline const ElemFactory &get_factory(void) const{
+    return factory;
+  }
+  // size of each of the two buffers, in elements
+  int  buf_size;
+ private:
+  // factory object used to load configures
+  ElemFactory factory;
+  // index in current buffer
+  int buf_index;
+  // indicate which one is current buffer
+  int current_buf;
+  // max limit of visit, also marks termination
+  int endA, endB;
+  // double buffer, one is accessed by loader
+  // the other is accessed by consumer
+  // buffer of the data
+  std::vector<Elem> bufA, bufB;
+  // initialization end
+  bool init_end;
+  // signal whether the data is loaded
+  bool data_loaded;
+  // signal to kill the thread
+  bool destroy_signal;
+  // thread object
+  Thread loader_thread;
+  // signal of the buffer
+  Semaphore loading_end, loading_need;
+  /*!
+   * \brief body of the loader thread
+   * fills the buffer that the consumer is not reading, producer-consumer style
+   */
+  inline void RunLoader(void) {
+    while(!destroy_signal) {
+      // sleep until loading is needed
+      loading_need.Wait();      
+      std::vector<Elem> &buf = current_buf ? bufB : bufA;
+      int i;
+      for (i = 0; i < buf_size ; ++i) {
+        if (!factory.LoadNext(buf[i])) {
+          int &end = current_buf ? endB : endA;
+          end = i; // marks the termination
+          break;
+        }
+      }
+      // signal that loading is done
+      data_loaded = true;
+      loading_end.Post();
+    }
+  }
+  /*!\brief entry point of loader thread */
+  inline static CXXNET_THREAD_PREFIX LoaderEntry(void *pthread) {
+    static_cast< ThreadBuffer<Elem,ElemFactory>* >(pthread)->RunLoader();
+    ThreadExit(NULL);
+    return NULL;
+  }
+  /*!\brief start loader thread */
+  inline void StartLoader(void) {
+    destroy_signal = false;
+    // set param
+    current_buf = 1;    
+    loading_need.Init(1);
+    loading_end .Init(0);
+    // reset terminate limit
+    endA = endB = buf_size;
+    loader_thread.Start(LoaderEntry, this);
+    // wait until first part of data is loaded
+    loading_end.Wait();
+    // set current buf to right value
+    current_buf = 0;
+    // wake loader for next part
+    data_loaded = false;
+    loading_need.Post();    
+    buf_index = 0; 
+  }
+  /*!\brief switch double buffer */
+  inline void SwitchBuffer(void) {
+    loading_end.Wait();
+    // loader shall be asleep now, critical zone!
+    current_buf = !current_buf;
+    // wake up loader
+    data_loaded = false;
+    loading_need.Post();
+  }
+};
+}  // namespace utils
+}  // namespace cxxnet
+#endif

From 9cad774d5c46fd154700a2504c370f6aa74b32ac Mon Sep 17 00:00:00 2001
From: sneakerkg <xiaotj1990327@gmail.com>
Date: Fri, 4 Sep 2015 01:15:31 +0800
Subject: [PATCH 09/15] pass compilation, not tested

---
 Makefile                      |  23 +++-
 include/mxnet/io.h            |   2 +-
 make/config.mk                |   4 +-
 src/common/utils.h            |   5 +-
 src/io/image_augmenter.h      |  21 ++--
 src/io/inst_vector.h          |  11 +-
 src/io/io.cc                  |  14 ++-
 src/io/iter_batch.h           |  36 +++---
 src/io/iter_image_recordio.cc |  38 ++++---
 src/utils/decoder.h           |   2 +-
 src/utils/io.h                | 175 -----------------------------
 src/utils/thread_buffer.h     | 205 ----------------------------------
 tests/python/test_io.py       |  15 +++
 13 files changed, 106 insertions(+), 445 deletions(-)
 delete mode 100644 src/utils/io.h
 delete mode 100644 src/utils/thread_buffer.h

diff --git a/Makefile b/Makefile
index d758c443241e..5f9dcb83c3d7 100644
--- a/Makefile
+++ b/Makefile
@@ -13,10 +13,15 @@ ifndef DMLC_CORE
 endif
 
 
+ifneq ($(USE_OPENMP_ITER), 1)
+	export NO_OPENMP = 1
+endif
+
 # use customized config file
 include $(config)
 include mshadow/make/mshadow.mk
 include $(DMLC_CORE)/make/dmlc.mk
+unexport NO_OPENMP
 
 # all tge possible warning tread
 WARNFLAGS= -Wall
@@ -39,10 +44,21 @@ endif
 
 # setup opencv
 ifeq ($(USE_OPENCV),1)
-	CFLAGS+= -DCXXNET_USE_OPENCV=1
+	CFLAGS+= -DMXNET_USE_OPENCV=1
 	LDFLAGS+= `pkg-config --libs opencv`
 else
-	CFLAGS+= -DCXXNET_USE_OPENCV=0
+	CFLAGS+= -DMXNET_USE_OPENCV=0
+endif
+
+# setup opencv decoder
+ifeq ($(USE_OPENCV_DECODER),1)
+	CFLAGS+= -DMXNET_USE_OPENCV_DECODER=1
+else
+	CFLAGS+= -DMXNET_USE_OPENCV_DECODER=0
+endif
+
+ifeq ($(USE_OPENMP_ITER), 1)
+	CFLAGS += -fopenmp
 endif
 
 ifeq ($(USE_CUDNN), 1)
@@ -62,7 +78,7 @@ endif
 ENGINE=naive_engine.o
 BIN = tests/test_simple_engine
 OBJ = narray_function_cpu.o
-OBJCXX11 = narray.o c_api.o operator.o symbol.o storage.o static_graph.o graph_executor.o io.o iter_mnist.o $(ENGINE)
+OBJCXX11 = narray.o c_api.o operator.o symbol.o storage.o static_graph.o graph_executor.o io.o iter_mnist.o iter_image_recordio.o $(ENGINE)
 CUOBJ = narray_function_gpu.o
 SLIB = lib/libmxnet.so
 ALIB = lib/libmxnet.a
@@ -92,6 +108,7 @@ operator.o: src/operator/operator.cc
 c_api.o: src/c_api.cc
 io.o: src/io/io.cc
 iter_mnist.o: src/io/iter_mnist.cc src/io/*.h
+iter_image_recordio.o: src/io/iter_image_recordio.cc
 
 # Rules for operators
 OPERATOR_HDR=$(wildcard src/operator/*-inl.h)
diff --git a/include/mxnet/io.h b/include/mxnet/io.h
index 5a8267befc1c..7bb86f4eece3 100644
--- a/include/mxnet/io.h
+++ b/include/mxnet/io.h
@@ -121,7 +121,7 @@ struct DataIteratorReg
  */
 #define MXNET_REGISTER_IO_CHAINED_ITER(name, ChainedDataIterType, HoldingDataIterType)          \
   static ::mxnet::IIterator<DataBatch>* __create__ ## ChainedDataIteratorType ## __() { \
-    return new HoldingDataIteratorType(new ChainedDataIterType);                                    \
+    return new HoldingDataIterType(new ChainedDataIterType);                                    \
   }                                                                     \
   DMLC_REGISTRY_REGISTER(::mxnet::DataIteratorReg, DataIteratorReg, name) \
   .set_body(__create__ ## ChainedDataIteratorType ## __)
diff --git a/make/config.mk b/make/config.mk
index cd04b146180c..3e93e240e493 100644
--- a/make/config.mk
+++ b/make/config.mk
@@ -27,8 +27,8 @@ USE_CUDA_PATH = NONE
 # whether use opencv during compilation
 # you can disable it, however, you will not able to use
 # imbin iterator
-USE_OPENCV = 0
-USE_OPENCV_DECODER = 0
+USE_OPENCV = 1
+USE_OPENCV_DECODER = 1
 # whether use CUDNN R3 library
 USE_CUDNN = 0
 # add the path to CUDNN libary to link and compile flag
diff --git a/src/common/utils.h b/src/common/utils.h
index f7a2dcce0470..b5edb78bd6f9 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -21,12 +21,11 @@ namespace common {
  * \brief Random Engine
  */
 typedef std::mt19937 RANDOM_ENGINE;
-
 // Get a double float, prnd is the pointer to a Random Engine
 #define NextDouble(prnd) std::generate_canonical<float, 10>(*prnd) 
+// Get a random int in [0, range)
+#define NextUInt32(range, prnd) static_cast<uint32_t>(floor(std::generate_canonical<float, 10>(*prnd) * range))
 
-#define NextUInt32(range, prnd) static_cast<uint32_t>(\
-        floor(std::generate_canonical<float, 10>(*prnd) * range))
 /*!
  * \brief Helper functions.
  */
diff --git a/src/io/image_augmenter.h b/src/io/image_augmenter.h
index d33464c4a889..3ca373d768b0 100644
--- a/src/io/image_augmenter.h
+++ b/src/io/image_augmenter.h
@@ -19,7 +19,6 @@ struct ImageAugmentParam : public dmlc::Parameter<ImageAugmentParam> {
   int crop_y_start_;
   /*! \brief whether we do nonrandom croping */
   int crop_x_start_;
-  /*! \brief Indicate the max ratation angle for augmentation, we will random rotate */
   /*! \brief [-max_rotate_angle, max_rotate_angle] */
   int max_rotate_angle_;
   /*! \brief max aspect ratio */
@@ -77,6 +76,7 @@ struct ImageAugmentParam : public dmlc::Parameter<ImageAugmentParam> {
         .describe("Rotate angle");
     DMLC_DECLARE_FIELD(fill_value_).set_default(255)
         .describe("Filled value while padding");
+  }
 };
 
 /*! \brief helper class to do image augmentation */
@@ -99,7 +99,7 @@ class ImageAugmenter {
                        << "input_shape must be three consecutive integers without space example: 1,1,200 ";
         }
         if (!strcmp(kwargs_left[i].first.c_str(), "rotate_list")) {
-          char* val = kwargs_left[i].second.c_str();
+          const char* val = kwargs_left[i].second.c_str();
           const char *end = val + strlen(val);
           char buf[128];
           while (val < end) {
@@ -121,9 +121,9 @@ class ImageAugmenter {
   virtual cv::Mat Process(const cv::Mat &src,
                           common::RANDOM_ENGINE *prnd) {
     // shear
-    float s = common::NextDouble(prnd) * param_.max_shear_ratio_ * 2 - param_.max_shear_ratio_;
+    float s = NextDouble(prnd) * param_.max_shear_ratio_ * 2 - param_.max_shear_ratio_;
     // rotate
-    int angle = common::NextUInt32(param_.max_rotate_angle_ * 2, prnd) - param_.max_rotate_angle_;
+    int angle = NextUInt32(param_.max_rotate_angle_ * 2, prnd) - param_.max_rotate_angle_;
     if (param_.rotate_ > 0) angle = param_.rotate_;
     if (rotate_list_.size() > 0) {
       angle = rotate_list_[NextUInt32(rotate_list_.size() - 1, prnd)];
@@ -160,7 +160,7 @@ class ImageAugmenter {
       mshadow::index_t rand_crop_size = NextUInt32(param_.max_crop_size_- param_.min_crop_size_+1, prnd)+ param_.min_crop_size_;
       mshadow::index_t y = res.rows - rand_crop_size;
       mshadow::index_t x = res.cols - rand_crop_size;
-      if (rand_crop_ != 0) {
+      if (param_.rand_crop_ != 0) {
         y = NextUInt32(y + 1, prnd);
         x = NextUInt32(x + 1, prnd);
       }
@@ -171,8 +171,8 @@ class ImageAugmenter {
       cv::resize(res(roi), res, cv::Size(shape_[1], shape_[2]));
     }
     else{
-      utils::Check(static_cast<mshadow::index_t>(res.cols) >= shape_[1] && static_cast<mshadow::index_t>(res.rows) >= shape_[2],
-        "input image size smaller than input shape");
+      CHECK(static_cast<mshadow::index_t>(res.cols) >= shape_[1] && static_cast<mshadow::index_t>(res.rows) >= shape_[2]) 
+          << "input image size smaller than input shape";
       mshadow::index_t y = res.rows - shape_[2];
       mshadow::index_t x = res.cols - shape_[1];
       if (param_.rand_crop_ != 0) {
@@ -240,9 +240,9 @@ class ImageAugmenter {
  private:
   // whether skip processing
   inline bool NeedProcess(void) const {
-    if (max_rotate_angle_ > 0 || max_shear_ratio_ > 0.0f
-        || rotate_ > 0 || rotate_list_.size() > 0) return true;
-    if (min_crop_size_ > 0 && max_crop_size_ > 0) return true;
+    if (param_.max_rotate_angle_ > 0 || param_.max_shear_ratio_ > 0.0f
+        || param_.rotate_ > 0 || rotate_list_.size() > 0) return true;
+    if (param_.min_crop_size_ > 0 && param_.max_crop_size_ > 0) return true;
     return false;
   }
   // temp input space
@@ -252,6 +252,7 @@ class ImageAugmenter {
   // rotation param
   cv::Mat rotateM;
   // parameters
+  ImageAugmentParam param_;
   /*! \brief input shape */
   mshadow::Shape<4> shape_;
   /*! \brief list of possible rotate angle */
diff --git a/src/io/inst_vector.h b/src/io/inst_vector.h
index 9490ceab94c1..4ced7dd64c63 100644
--- a/src/io/inst_vector.h
+++ b/src/io/inst_vector.h
@@ -7,10 +7,11 @@
 #ifndef MXNET_INST_VECTOR_H_
 #define MXNET_INST_VECTOR_H_
 
-#include "./data.h"
-#include <vector>
+#include <mxnet/io.h>
+#include <mxnet/base.h>
 #include <dmlc/base.h>
 #include <mshadow/tensor.h>
+#include <vector>
 
 namespace mxnet {
 namespace io {
@@ -30,7 +31,7 @@ class TensorVector {
     CHECK(i + 1 < offset_.size());
     CHECK(shape_[i].Size() == offset_[i + 1] - offset_[i]);
     return mshadow::Tensor<cpu, dim, DType>
-        ((DType*)BeginPtr(content_) + offset_[i], shape_[i]);
+        ((DType*)dmlc::BeginPtr(content_) + offset_[i], shape_[i]);
   }
   inline mshadow::Tensor<cpu, dim, DType> Back() const {
     return (*this)[Size() - 1];
@@ -73,8 +74,8 @@ class InstVector {
   inline DataInst operator[](size_t i) const {
     DataInst inst;
     inst.index = index_[i];
-    inst.data = data_[i];
-    inst.label = label_[i];
+    inst.data.push_back(TBlob(data_[i]));
+    inst.data.push_back(TBlob(label_[i]));
     return inst;
   }
   // get back of instance vector
diff --git a/src/io/io.cc b/src/io/io.cc
index 9095f4089c92..b2dbc9f8c2c5 100644
--- a/src/io/io.cc
+++ b/src/io/io.cc
@@ -4,14 +4,18 @@
 
 #include <mxnet/io.h>
 #include <dmlc/registry.h>
-#include <image_augmenter.h>
-#include <>
-#include <iter_batch.h>
+#include "./image_augmenter.h"
+#include "./iter_batch.h"
 
 // Registers
 namespace dmlc {
 DMLC_REGISTRY_ENABLE(::mxnet::DataIteratorReg);
+}  // namespace dmlc
+
+namespace mxnet {
+namespace io {
 // Register parameters in header files
 DMLC_REGISTER_PARAMETER(BatchParam);
-DMLC_REGISTER_PARAMETER(ImageAugmenterParam);
-}  // namespace dmlc
\ No newline at end of file
+DMLC_REGISTER_PARAMETER(ImageAugmentParam);
+}  // namespace io
+}  // namespace mxnet
diff --git a/src/io/iter_batch.h b/src/io/iter_batch.h
index a0e4ab7e7ba5..f258bc2d6afd 100644
--- a/src/io/iter_batch.h
+++ b/src/io/iter_batch.h
@@ -36,7 +36,7 @@ struct BatchParam : public dmlc::Parameter<BatchParam> {
     DMLC_DECLARE_FIELD(test_skipread_).set_default(false)
         .describe("Skip read for testing.");
     DMLC_DECLARE_FIELD(silent_).set_default(false)
-        .describe("Whether to print batch information.")
+        .describe("Whether to print batch information.");
   }
 };
     
@@ -48,7 +48,7 @@ class BatchAdaptIter: public IIterator<DataBatch> {
   }
   virtual ~BatchAdaptIter(void) {
     delete base_;
-    out_.FreeSpaceDense();
+    FreeSpaceDense();
   }
   virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
     std::vector<std::pair<std::string, std::string> > kwargs_left;
@@ -57,7 +57,7 @@ class BatchAdaptIter: public IIterator<DataBatch> {
     for (size_t i = 0; i < kwargs_left.size(); i++) {
       if (!strcmp(kwargs_left[i].first.c_str(), "input_shape")) {
         CHECK(sscanf(kwargs_left[i].second.c_str(), "%u,%u,%u", &shape_[1], &shape_[2], &shape_[3]) == 3)
-          << "input_shape must be three consecutive integers without space example: 1,1,200 ")
+          << "input_shape must be three consecutive integers without space example: 1,1,200 ";
       }
     }
     // init base iterator
@@ -88,13 +88,13 @@ class BatchAdaptIter: public IIterator<DataBatch> {
 
     while (base_->Next()) {
       const DataInst& d = base_->Value();
-      mshadow::Copy(label[top], d.data[1].get<mshadow::cpu, 2, float>());
+      mshadow::Copy(label[top], d.data[1].get<mshadow::cpu, 1, float>());
       out_.inst_index[top] = d.index;
-      mshadow::Copy(data[top], d.data[0].get<mshadow::cpu, 4, float>());
+      mshadow::Copy(data[top], d.data[0].get<mshadow::cpu, 3, float>());
 
       if (++ top >= param_.batch_size_) {
-        out.data[0] = TBlob(data);
-        out.data[1] = TBlob(label);
+        out_.data[0] = TBlob(data);
+        out_.data[1] = TBlob(label);
         return true;
       }
     }
@@ -105,16 +105,16 @@ class BatchAdaptIter: public IIterator<DataBatch> {
         for (; top < param_.batch_size_; ++top, ++num_overflow_) {
           CHECK(base_->Next()) << "number of input must be bigger than batch size";
           const DataInst& d = base_->Value();
-          mshadow::Copy(label[top], d.data[1].get<mshadow::cpu, 2, float>());
+          mshadow::Copy(label[top], d.data[1].get<mshadow::cpu, 1, float>());
           out_.inst_index[top] = d.index;
-          mshadow::Copy(data[top], d.data[0].get<mshadow::cpu, 4, float>());
+          mshadow::Copy(data[top], d.data[0].get<mshadow::cpu, 3, float>());
         }
         out_.num_batch_padd = num_overflow_;
       } else {
-        out_.num_batch_padd = batch_size_ - top;
+        out_.num_batch_padd = param_.batch_size_ - top;
       }
-      out.data[0] = TBlob(data);
-      out.data[1] = TBlob(label);
+      out_.data[0] = TBlob(data);
+      out_.data[1] = TBlob(label);
       return true;
     }
     return false;
@@ -124,6 +124,8 @@ class BatchAdaptIter: public IIterator<DataBatch> {
     return out_;
   }
 private:
+  /*! \brief batch parameters */
+  BatchParam param_;
   /*! \brief base iterator */
   IIterator<DataInst> *base_;
   /*! \brief input shape */
@@ -141,16 +143,16 @@ class BatchAdaptIter: public IIterator<DataBatch> {
   // Functions that allocate and free tensor space
   inline void AllocSpaceDense(bool pad = false) { 
     data = mshadow::NewTensor<mshadow::cpu>(shape_, 0.0f, pad);
-    mshadow::Shape<2> lshape = mshadow::Shape2(batch_size, label_width);
+    mshadow::Shape<2> lshape = mshadow::Shape2(param_.batch_size_, param_.label_width_);
     label = mshadow::NewTensor<mshadow::cpu>(lshape, 0.0f, pad);
-    out_.inst_index = new unsigned[batch_size];
-    out_.batch_size = batch_size;
+    out_.inst_index = new unsigned[param_.batch_size_];
+    out_.batch_size = param_.batch_size_;
     out_.data.resize(2);
   }
   /*! \brief auxiliary function to free space, if needed, dense only */
   inline void FreeSpaceDense(void) {
     if (label.dptr_ != NULL) {
-      delete [] inst_index;
+      delete [] out_.inst_index;
       mshadow::FreeSpace(&label);
       mshadow::FreeSpace(&data);
       label.dptr_ = NULL;
@@ -159,4 +161,4 @@ class BatchAdaptIter: public IIterator<DataBatch> {
 }; // class BatchAdaptIter
 }  // namespace io
 }  // namespace cxxnet
-#endif  // MXNET_IO_ITER_BATCH_H_
\ No newline at end of file
+#endif  // MXNET_IO_ITER_BATCH_H_
diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc
index 2ab1aa8958cb..9977ddd2290c 100644
--- a/src/io/iter_image_recordio.cc
+++ b/src/io/iter_image_recordio.cc
@@ -16,6 +16,7 @@ iterator
 #include "./inst_vector.h"
 #include "./image_recordio.h"
 #include "./image_augmenter.h"
+#include "./iter_batch.h"
 #include "../utils/decoder.h"
 namespace mxnet {
 namespace io {
@@ -57,7 +58,7 @@ class ImageLabelMap {
     // be careful not to resize label_ afterwards
     idx2label_.reserve(image_index_.size());
     for (size_t i = 0; i < image_index_.size(); ++i) {
-      idx2label_[image_index_[i]] = BeginPtr(label_) + i * label_width_;
+      idx2label_[image_index_[i]] = dmlc::BeginPtr(label_) + i * label_width_;
     }
     if (!silent) {
       LOG(INFO) << "Loaded ImageList from " << path_imglist << ' '
@@ -101,7 +102,7 @@ struct ImageRecParserParam : public dmlc::Parameter<ImageRecParserParam> {
   DMLC_DECLARE_PARAMETER(ImageRecParserParam) {
     DMLC_DECLARE_FIELD(path_imglist_).set_default("")
         .describe("Path to image list.");
-    DMLC_DECLARE_FIELD(path_imagrec_).set_default("./data/imgrec.rec")
+    DMLC_DECLARE_FIELD(path_imgrec_).set_default("./data/imgrec.rec")
         .describe("Path to image record file.");
     DMLC_DECLARE_FIELD(nthread_).set_lower_bound(1).set_default(4)
         .describe("Number of thread to do parsing.");
@@ -178,7 +179,7 @@ inline void ImageRecordIOParser::Init(const std::vector<std::pair<std::string, s
   // setup decoders
   for (int i = 0; i < threadget; ++i) {
     augmenters_.push_back(new ImageAugmenter());
-    augmenters_[i].init(kwargs_left);
+    augmenters_[i]->Init(kwargs_left);
     prnds_.push_back(new common::RANDOM_ENGINE((i + 1) * kRandMagic));
   }
   
@@ -186,16 +187,16 @@ inline void ImageRecordIOParser::Init(const std::vector<std::pair<std::string, s
   // TODO, hack
   const char *ps_rank = getenv("PS_RANK");
   if (ps_rank != NULL) {
-    param_.dist_worker_rank = atoi(ps_rank);
+    param_.dist_worker_rank_ = atoi(ps_rank);
   }
 
   if (param_.path_imglist_.length() != 0) {
     label_map_ = new ImageLabelMap(param_.path_imglist_.c_str(),
-                                   param_.label_width_, silent_ != 0);
+                                   param_.label_width_, param_.silent_ != 0);
   } else {
     param_.label_width_ = 1;
   }
-  CHECK(path_imgrec_.length() != 0)
+  CHECK(param_.path_imgrec_.length() != 0)
     << "ImageRecordIOIterator: must specify image_rec";
 #if MSHADOW_DIST_PS
     // TODO move to a better place
@@ -221,8 +222,8 @@ ParseNext(std::vector<InstVector> *out_vec) {
   {
     CHECK(omp_get_num_threads() == param_.nthread_);
     int tid = omp_get_thread_num();
-    dmlc::RecordIOChunkReader reader(chunk, tid, parser_.nthread_);
-    mxnet::ImageRecordIO rec;
+    dmlc::RecordIOChunkReader reader(chunk, tid, param_.nthread_);
+    ImageRecordIO rec;
     dmlc::InputSplit::Blob blob;
     // image data
     InstVector &out = (*out_vec)[tid];
@@ -238,18 +239,21 @@ ParseNext(std::vector<InstVector> *out_vec) {
                mshadow::Shape3(3, res.rows, res.cols),
                mshadow::Shape1(param_.label_width_));
       DataInst inst = out.Back();
+      // turn datainst into tensor
+      mshadow::Tensor<mshadow::cpu, 3> data = inst.data[0].get<mshadow::cpu, 3, float>(); 
+      mshadow::Tensor<mshadow::cpu, 1> label = inst.data[1].get<mshadow::cpu, 1, float>(); 
       for (int i = 0; i < res.rows; ++i) {
         for (int j = 0; j < res.cols; ++j) {
           cv::Vec3b bgr = res.at<cv::Vec3b>(i, j);
-          inst.data[0][i][j] = bgr[2];
-          inst.data[1][i][j] = bgr[1];
-          inst.data[2][i][j] = bgr[0];
+          data[0][i][j] = bgr[2];
+          data[1][i][j] = bgr[1];
+          data[2][i][j] = bgr[0];
         }
       }
       if (label_map_ != NULL) {
-        mshadow::Copy(inst.label, label_map_->Find(rec.image_index()));
+        mshadow::Copy(label, label_map_->Find(rec.image_index()));
       } else {
-        inst.label[0] = rec.header.label;
+        label[0] = rec.header.label;
       }
       res.release();
     }
@@ -324,7 +328,7 @@ class ImageRecordIter : public IIterator<DataInst> {
         }
         // shuffle instance order if needed
         if (shuffle_ != 0) {
-            std::shuffle(inst_order_.begin(), inst_.end(), common::RANDOM_ENGINE(kRandMagic + param_.seed));
+            std::shuffle(inst_order_.begin(), inst_order_.end(), common::RANDOM_ENGINE(kRandMagic + param_.seed));
         }
         inst_ptr_ = 0;
       }
@@ -353,17 +357,15 @@ class ImageRecordIter : public IIterator<DataInst> {
   // backend thread
   dmlc::ThreadedIter<std::vector<InstVector> > iter_;
   // parameters
-  ImageRecParserParam param_;
+  ImageRecordParam param_;
 };
 DMLC_REGISTER_PARAMETER(ImageRecParserParam);
 DMLC_REGISTER_PARAMETER(ImageRecordParam);
-MXNET_REGISTER_IO_ITER(MNISTIter, MNISTIter)
 MXNET_REGISTER_IO_CHAINED_ITER(ImageRecordIter, ImageRecordIter, BatchAdaptIter)
     .describe("Create iterator for dataset packed in recordio.")
     .add_arguments(ImageRecordParam::__FIELDS__())
     .add_arguments(ImageRecParserParam::__FIELDS__())
     .add_arguments(BatchParam::__FIELDS__())
-    .add_arguments(ImageAugmenterParam::__FIELDS__());
+    .add_arguments(ImageAugmentParam::__FIELDS__());
 }  // namespace io
 }  // namespace mxnet
-#endif  // ITER_IMAGE_RECORDIO_INL_HPP_
diff --git a/src/utils/decoder.h b/src/utils/decoder.h
index 17203392cc60..52db01edee23 100644
--- a/src/utils/decoder.h
+++ b/src/utils/decoder.h
@@ -13,7 +13,7 @@
   #include <opencv2/opencv.hpp>
 #endif
 
-namespace cxxnet {
+namespace mxnet {
 namespace utils {
 
 #if MXNET_USE_OPENCV_DECODER == 0
diff --git a/src/utils/io.h b/src/utils/io.h
deleted file mode 100644
index 3781ce98b012..000000000000
--- a/src/utils/io.h
+++ /dev/null
@@ -1,175 +0,0 @@
-#ifndef CXXNET_UTILS_IO_H_
-#define CXXNET_UTILS_IO_H_
-/*!
- * \file io.h
- * \brief definition of abstract stream interface for IO
- * \author Bing Xu Tianqi Chen
- */
-#include "./utils.h"
-#include <dmlc/io.h>
-#include <string>
-#include <algorithm>
-#include <cstring>
-
-namespace cxxnet {
-namespace utils {
-typedef dmlc::Stream IStream;
-typedef dmlc::SeekStream ISeekStream;
-
-/*! \brief a in memory buffer that can be read and write as stream interface */
-struct MemoryBufferStream : public ISeekStream {
- public:
-  MemoryBufferStream(std::string *p_buffer)
-      : p_buffer_(p_buffer) {
-    curr_ptr_ = 0;
-  }
-  virtual ~MemoryBufferStream(void) {}
-  virtual size_t Read(void *ptr, size_t size) {
-    CHECK(curr_ptr_ <= p_buffer_->length())
-          << " read can not have position excceed buffer length";
-    size_t nread = std::min(p_buffer_->length() - curr_ptr_, size);
-    if (nread != 0) memcpy(ptr, &(*p_buffer_)[0] + curr_ptr_, nread);
-    curr_ptr_ += nread;
-    return nread;
-  }
-  virtual void Write(const void *ptr, size_t size) {
-    if (size == 0) return;
-    if (curr_ptr_ + size > p_buffer_->length()) {
-      p_buffer_->resize(curr_ptr_+size);
-    }
-    memcpy(&(*p_buffer_)[0] + curr_ptr_, ptr, size);
-    curr_ptr_ += size;
-  }
-  virtual void Seek(size_t pos) {
-    curr_ptr_ = static_cast<size_t>(pos);
-  }
-  virtual size_t Tell(void) {
-    return curr_ptr_;
-  }
-
- private:
-  /*! \brief in memory buffer */
-  std::string *p_buffer_;
-  /*! \brief current pointer */
-  size_t curr_ptr_;
-}; // class MemoryBufferStream
-
-/*! \brief implementation of file i/o stream */
-class StdFile: public ISeekStream {
- public:
-  /*! \brief constructor */
-  StdFile(const char *fname, const char *mode) {
-    Open(fname, mode);
-  }
-  StdFile() {}
-  virtual ~StdFile(void) {
-    this->Close();
-  }
-  virtual void Open(const char *fname, const char *mode) {
-    fp_ = utils::FopenCheck(fname, mode);
-    fseek(fp_, 0L, SEEK_END);
-    sz_ = ftell(fp_);
-    fseek(fp_, 0L, SEEK_SET);
-  }
-  virtual size_t Read(void *ptr, size_t size) {
-    return fread(ptr, size, 1, fp_);
-  }
-  virtual void Write(const void *ptr, size_t size) {
-    fwrite(ptr, size, 1, fp_);
-  }
-  virtual void Seek(size_t pos) {
-    fseek(fp_, pos, SEEK_SET);
-  }
-  virtual size_t Tell(void) {
-    return static_cast<size_t>(ftell(fp_));
-  }
-  inline void Close(void) {
-    if (fp_ != NULL){
-      fclose(fp_); fp_ = NULL;
-    }
-  }
-  inline size_t Size() {
-    return sz_;
-  }
- private:
-  FILE *fp_;
-  size_t sz_;
-}; // class StdFile
-
-/*! \brief Basic page class */
-class BinaryPage {
- public:
-  /*! \brief page size 64 MB */
-  static const size_t kPageSize = 64 << 18;
- public:
-  /*! \brief memory data object */
-  struct Obj{
-    /*! \brief pointer to the data*/
-    void  *dptr;
-    /*! \brief size */
-    size_t sz;
-    Obj(void * dptr, size_t sz) : dptr(dptr), sz(sz){}
-  };
- public:
-  /*! \brief constructor of page */
-  BinaryPage(void)  {
-    data_ = new int[kPageSize];
-    utils::Check(data_ != NULL, "fail to allocate page, out of space");
-    this->Clear();
-  };
-  ~BinaryPage() {
-    if (data_) delete [] data_;
-  }
-  /*!
-   * \brief load one page form instream
-   * \return true if loading is successful
-   */
-  inline bool Load(utils::IStream &fi) {
-    return fi.Read(&data_[0], sizeof(int)*kPageSize) !=0;
-  }
-  /*! \brief save one page into outstream */
-  inline void Save(utils::IStream &fo) {
-    fo.Write(&data_[0], sizeof(int)*kPageSize);
-  }
-  /*! \return number of elements */
-  inline int Size(void){
-    return data_[0];
-  }
-  /*! \brief Push one binary object into page
-   *  \param fname file name of obj need to be pushed into
-   *  \return false or true to push into
-   */
-  inline bool Push(const Obj &dat) {
-    if(this->FreeBytes() < dat.sz + sizeof(int)) return false;
-    data_[ Size() + 2 ] = data_[ Size() + 1 ] + dat.sz;
-    memcpy(this->offset(data_[ Size() + 2 ]), dat.dptr, dat.sz);
-    ++ data_[0];
-    return true;
-  }
-  /*! \brief Clear the page */
-  inline void Clear(void) {
-    memset(&data_[0], 0, sizeof(int) * kPageSize);
-  }
-  /*!
-   * \brief Get one binary object from page
-   *  \param r r th obj in the page
-   */
-  inline Obj operator[](int r) {
-    CHECK(r < Size());
-    return Obj(this->offset(data_[ r + 2 ]),  data_[ r + 2 ] - data_[ r + 1 ]);
-  }
- private:
-  /*! \return number of elements */
-  inline size_t FreeBytes(void) {
-    return (kPageSize - (Size() + 2)) * sizeof(int) - data_[ Size() + 1 ];
-  }
-  inline void* offset(int pos) {
-    return (char*)(&data_[0]) + (kPageSize*sizeof(int) - pos);
-  }
- private:
-  //int data_[ kPageSize ];
-  int *data_;
-};  // class BinaryPage
-}  // namespace utils
-}  // namespace cxxnet
-#endif
diff --git a/src/utils/thread_buffer.h b/src/utils/thread_buffer.h
deleted file mode 100644
index 7df1ae17aa56..000000000000
--- a/src/utils/thread_buffer.h
+++ /dev/null
@@ -1,205 +0,0 @@
-#ifndef CXXNET_UTILS_THREAD_BUFFER_H_
-#define CXXNET_UTILS_THREAD_BUFFER_H_
-/*!
- * \file thread_buffer.h
- * \brief  multi-thread buffer, iterator, can be used to create parallel pipeline
- * \author Tianqi Chen
- */
-#include <vector>
-#include <cstring>
-#include <cstdlib>
-#include "./utils.h"
-#include "./thread.h"
-namespace cxxnet {
-namespace utils {
-/*!
- * \brief buffered loading iterator that uses multithread
- * this template method will assume the following paramters
- * \tparam Elem elememt type to be buffered
- * \tparam ElemFactory factory type to implement in order to use thread buffer
- */
-template<typename Elem, typename ElemFactory>
-class ThreadBuffer {
- public:
-  /*!\brief constructor */
-  ThreadBuffer(void) {
-    this->init_end = false;
-    this->buf_size = 30;
-  }
-  ~ThreadBuffer(void) {
-    if(init_end) this->Destroy();
-  }
-  /*!\brief set parameter, will also pass the parameter to factory */
-  inline void SetParam(const char *name, const char *val) {
-    if (!strcmp( name, "buffer_size")) buf_size = atoi(val);
-    factory.SetParam(name, val);
-  }
-  /*!
-   * \brief initalize the buffered iterator
-   * \param param a initialize parameter that will pass to factory, ignore it if not necessary
-   * \return false if the initlization can't be done, e.g. buffer file hasn't been created 
-   */
-  inline bool Init(void) {
-    if (!factory.Init()) return false;
-    bufA.reserve(buf_size);
-    bufB.reserve(buf_size);
-    for (int i = 0; i < buf_size; ++i) {
-      bufA.push_back(factory.Create());
-      bufB.push_back(factory.Create());
-    }
-    this->init_end = true;
-    this->StartLoader();
-    return true;
-  }  
-  /*!\brief place the iterator before first value */
-  inline void BeforeFirst(void) {
-    // wait till last loader end
-    loading_end.Wait();
-    // critcal zone
-    current_buf = 1;
-    factory.BeforeFirst();
-    // reset terminate limit
-    endA = endB = buf_size;
-    // wake up loader for first part
-    loading_need.Post();
-    // wait til first part is loaded
-    loading_end.Wait();
-    // set current buf to right value
-    current_buf = 0;
-    // wake loader for next part
-    data_loaded = false;
-    loading_need.Post();
-    // set buffer value
-    buf_index = 0;
-  }  
-  /*! \brief destroy the buffer iterator, will deallocate the buffer */
-  inline void Destroy(void) {
-    // wait until the signal is consumed
-    this->destroy_signal = true;
-    loading_need.Post();
-    loader_thread.Join();
-    loading_need.Destroy();
-    loading_end.Destroy();    
-    for (size_t i = 0; i < bufA.size(); ++i) {
-      factory.FreeSpace(bufA[i]);
-    }
-    for (size_t i = 0; i < bufB.size(); ++i) {
-      factory.FreeSpace(bufB[i]);
-    }
-    bufA.clear(); bufB.clear();
-    factory.Destroy();
-    this->init_end = false;
-  }  
-  /*!
-   * \brief get the next element needed in buffer
-   * \param elem element to store into
-   * \return whether reaches end of data
-   */
-  inline bool Next(Elem &elem) {
-    // end of buffer try to switch
-    if (buf_index == buf_size) {
-      this->SwitchBuffer();
-      buf_index = 0;
-    }
-    if (buf_index >= (current_buf ? endA : endB)) { 
-      return false;
-    }
-    std::vector<Elem> &buf = current_buf ? bufA : bufB;
-    elem = buf[buf_index];
-    ++buf_index;
-    return true;
-  }      
-  /*!
-   * \brief get the factory object
-   */
-  inline ElemFactory &get_factory(void) {
-    return factory;
-  }
-  inline const ElemFactory &get_factory(void) const{
-    return factory;
-  }
-  // size of buffer
-  int  buf_size;
- private:
-  // factory object used to load configures
-  ElemFactory factory;
-  // index in current buffer
-  int buf_index;
-  // indicate which one is current buffer
-  int current_buf;
-  // max limit of visit, also marks termination
-  int endA, endB;
-  // double buffer, one is accessed by loader
-  // the other is accessed by consumer
-  // buffer of the data
-  std::vector<Elem> bufA, bufB;
-  // initialization end
-  bool init_end;
-  // singal whether the data is loaded
-  bool data_loaded;
-  // signal to kill the thread
-  bool destroy_signal;
-  // thread object
-  Thread loader_thread;
-  // signal of the buffer
-  Semaphore loading_end, loading_need;
-  /*!
-   * \brief slave thread
-   * this implementation is like producer-consumer style
-   */
-  inline void RunLoader(void) {
-    while(!destroy_signal) {
-      // sleep until loading is needed
-      loading_need.Wait();      
-      std::vector<Elem> &buf = current_buf ? bufB : bufA;
-      int i;
-      for (i = 0; i < buf_size ; ++i) {
-        if (!factory.LoadNext(buf[i])) {
-          int &end = current_buf ? endB : endA;
-          end = i; // marks the termination
-          break;
-        }
-      }
-      // signal that loading is done
-      data_loaded = true;
-      loading_end.Post();
-    }
-  }
-  /*!\brief entry point of loader thread */
-  inline static CXXNET_THREAD_PREFIX LoaderEntry(void *pthread) {
-    static_cast< ThreadBuffer<Elem,ElemFactory>* >(pthread)->RunLoader();
-    ThreadExit(NULL);
-    return NULL;
-  }
-  /*!\brief start loader thread */
-  inline void StartLoader(void) {
-    destroy_signal = false;
-    // set param
-    current_buf = 1;    
-    loading_need.Init(1);
-    loading_end .Init(0);
-    // reset terminate limit
-    endA = endB = buf_size;
-    loader_thread.Start(LoaderEntry, this);
-    // wait until first part of data is loaded
-    loading_end.Wait();
-    // set current buf to right value
-    current_buf = 0;
-    // wake loader for next part
-    data_loaded = false;
-    loading_need.Post();    
-    buf_index = 0; 
-  }
-  /*!\brief switch double buffer */
-  inline void SwitchBuffer(void) {
-    loading_end.Wait();
-    // loader shall be sleep now, critcal zone!
-    current_buf = !current_buf;
-    // wake up loader
-    data_loaded = false;
-    loading_need.Post();
-  }
-};
-}  // namespace utils
-}  // namespace cxxnet
-#endif
diff --git a/tests/python/test_io.py b/tests/python/test_io.py
index dfeb3f67c293..991a4813033e 100644
--- a/tests/python/test_io.py
+++ b/tests/python/test_io.py
@@ -39,3 +39,18 @@ def test_MNISTIter_reset():
     label_1 = train_dataiter.getlabel().numpy.flatten()
     assert(sum(label_0 - label_1) == 0)
 
+def test_ImageRecIter():
+    dataiter = mx.io.ImageRecordIter(path_imgrec="data/val_cxxnet.rec",
+            image_mean="data/val_cxxnet_mean.bin",
+            rand_crop=True,
+            rand_mirror=True,
+            input_shape="3,224,224",
+            batch_size=128)
+
+
+
+
+
+
+
+

From 2566b3e9dcb90e9322a5f9cfa24c27ffba1d0c29 Mon Sep 17 00:00:00 2001
From: sneakerkg <xiaotj1990327@gmail.com>
Date: Sun, 6 Sep 2015 07:47:04 +0800
Subject: [PATCH 10/15] merge augmenter, modify param attribute

---
 src/io/image_augmenter.h      | 279 +++++++++++++++++++++++++---------
 src/io/iter_batch.h           |  55 ++++---
 src/io/iter_image_recordio.cc |  76 ++++-----
 3 files changed, 275 insertions(+), 135 deletions(-)

diff --git a/src/io/image_augmenter.h b/src/io/image_augmenter.h
index 3ca373d768b0..a81e5297d5b3 100644
--- a/src/io/image_augmenter.h
+++ b/src/io/image_augmenter.h
@@ -1,7 +1,7 @@
 /*!
  * \file image_augmenter_opencv.hpp
  * \brief threaded version of page iterator
- * \author Naiyan Wang, Tianqi Chen
+ * \author Naiyan Wang, Tianqi Chen, Tianjun Xiao
  */
 #ifndef MXNET_IO_IMAGE_AUGMENTER_H_
 #define MXNET_IO_IMAGE_AUGMENTER_H_
@@ -14,68 +14,102 @@ namespace io {
 /*! \brief image augmentation parameters*/
 struct ImageAugmentParam : public dmlc::Parameter<ImageAugmentParam> {
   /*! \brief whether we do random cropping */
-  bool rand_crop_;
+  bool rand_crop;
   /*! \brief whether we do nonrandom croping */
-  int crop_y_start_;
+  int crop_y_start;
   /*! \brief whether we do nonrandom croping */
-  int crop_x_start_;
+  int crop_x_start;
   /*! \brief [-max_rotate_angle, max_rotate_angle] */
-  int max_rotate_angle_;
+  int max_rotate_angle;
   /*! \brief max aspect ratio */
-  float max_aspect_ratio_;
+  float max_aspect_ratio;
   /*! \brief random shear the image [-max_shear_ratio, max_shear_ratio] */
-  float max_shear_ratio_;
+  float max_shear_ratio;
   /*! \brief max crop size */
-  int max_crop_size_;
+  int max_crop_size;
   /*! \brief min crop size */
-  int min_crop_size_;
+  int min_crop_size;
   /*! \brief max scale ratio */
-  float max_random_scale_;
+  float max_random_scale;
   /*! \brief min scale_ratio */
-  float min_random_scale_;
+  float min_random_scale;
   /*! \brief min image size */
-  float min_img_size_;
+  float min_img_size;
   /*! \brief max image size */
-  float max_img_size_;
-  /*! \brief whether to mirror the image */
-  bool mirror_;
+  float max_img_size;
   /*! \brief rotate angle */
-  int rotate_;
+  int rotate;
   /*! \brief filled color while padding */
-  int fill_value_;
+  int fill_value;
+  /*! \brief whether to mirror the image */
+  bool mirror;
+  /*! \brief whether to randomly mirror the image */
+  bool rand_mirror;
+  /*! \brief mean file string*/
+  std::string mean_img;
+  /*! \brief mean value for r channel */
+  float mean_r;
+  /*! \brief mean value for g channel */
+  float mean_g;
+  /*! \brief mean value for b channel */
+  float mean_b;
+  /*! \brief shape of the image data*/
+  TShape input_shape;
+  /*! \brief maximum ratio of contrast variation */
+  float max_random_contrast_;
+  /*! \brief maximum value of illumination variation */
+  float max_random_illumination_;
   // declare parameters
   // TODO: didn't understand the range for some params
   DMLC_DECLARE_PARAMETER(ImageAugmentParam) {
     DMLC_DECLARE_FIELD(rand_crop_).set_default(true)
         .describe("Whether we de random cropping");
-    DMLC_DECLARE_FIELD(crop_y_start_).set_default(-1)
+    DMLC_DECLARE_FIELD(crop_y_start).set_default(-1)
         .describe("Where to nonrandom crop on y");
-    DMLC_DECLARE_FIELD(crop_x_start_).set_default(-1)
+    DMLC_DECLARE_FIELD(crop_x_start).set_default(-1)
         .describe("Where to nonrandom crop on x");
-    DMLC_DECLARE_FIELD(max_rotate_angle_).set_default(0.0f)
+    DMLC_DECLARE_FIELD(max_rotate_angle).set_default(0.0f)
         .describe("Rotate can be [-max_rotate_angle, max_rotate_angle]");
-    DMLC_DECLARE_FIELD(max_aspect_ratio_).set_default(0.0f)
+    DMLC_DECLARE_FIELD(max_aspect_ratio).set_default(0.0f)
         .describe("Max aspect ratio");
-    DMLC_DECLARE_FIELD(max_shear_ratio_).set_default(0.0f)
+    DMLC_DECLARE_FIELD(max_shear_ratio).set_default(0.0f)
         .describe("Shear rotate can be made between [-max_shear_ratio_, max_shear_ratio_]");
-    DMLC_DECLARE_FIELD(max_crop_size_).set_default(-1)
+    DMLC_DECLARE_FIELD(max_crop_size).set_default(-1)
         .describe("Maximum crop size");
-    DMLC_DECLARE_FIELD(min_crop_size_).set_default(-1)
+    DMLC_DECLARE_FIELD(min_crop_size).set_default(-1)
         .describe("Minimum crop size");
-    DMLC_DECLARE_FIELD(max_random_scale_).set_default(1.0f)
+    DMLC_DECLARE_FIELD(max_random_scale).set_default(1.0f)
         .describe("Maxmum scale ratio");
-    DMLC_DECLARE_FIELD(min_random_scale_).set_default(1.0f)
+    DMLC_DECLARE_FIELD(min_random_scale).set_default(1.0f)
         .describe("Minimum scale ratio");       
-    DMLC_DECLARE_FIELD(max_img_size_).set_default(1e10f)
+    DMLC_DECLARE_FIELD(max_img_size).set_default(1e10f)
         .describe("Maxmum image size");
-    DMLC_DECLARE_FIELD(min_img_size_).set_default(0.0f)
+    DMLC_DECLARE_FIELD(min_img_size).set_default(0.0f)
         .describe("Minimum image size");
-    DMLC_DECLARE_FIELD(mirror_).set_default(false)
-        .describe("Whether to mirror the image");
-    DMLC_DECLARE_FIELD(rotate_).set_default(-1.0f)
+    DMLC_DECLARE_FIELD(rotate).set_default(-1.0f)
         .describe("Rotate angle");
-    DMLC_DECLARE_FIELD(fill_value_).set_default(255)
+    DMLC_DECLARE_FIELD(fill_value).set_default(255)
         .describe("Filled value while padding");
+    DMLC_DECLARE_FIELD(mirror).set_default(false)
+        .describe("Whether to mirror the image");
+    DMLC_DECLARE_FIELD(rand_mirror).set_default(false)
+        .describe("Whether to mirror the image randomly");
+    DMLC_DECLARE_FIELD(mean_img).set_default("")
+        .describe("Mean Image to be subtracted");
+    DMLC_DECLARE_FIELD(mean_r).set_default(0.0f)
+        .describe("Mean value on R channel");
+    DMLC_DECLARE_FIELD(mean_g).set_default(0.0f)
+        .describe("Mean value on G channel");   
+    DMLC_DECLARE_FIELD(mean_b).set_default(0.0f)
+        .describe("Mean value on B channel");
+    float input_shape_default = {3, 224, 224};
+    DMLC_DECLARE_FIELD(input_shape).set_default(TShape(input_shape_default, input_shape_default + 3))
+        .set_expect_ndim(3).enforce_nonzero()
+        .describe("Input shape of the neural net");
+    DMLC_DECLARE_FIELD(max_random_contrast).set_default(0.0f)
+        .describe("Maximum ratio of contrast variation");
+    DMLC_DECLARE_FIELD(max_random_illumination).set_default(0.0f)
+        .describe("Maximum value of illumination variation");
   }
 };
 
@@ -84,8 +118,8 @@ class ImageAugmenter {
  public:
   // contructor
   ImageAugmenter(void)
-      : tmpres(false),
-        rotateM(2, 3, CV_32F) {
+      : tmpres_(false),
+        rotateM_(2, 3, CV_32F) {
   }
   virtual ~ImageAugmenter() {
   }
@@ -94,10 +128,6 @@ class ImageAugmenter {
     std::vector<std::pair<std::string, std::string> > kwargs_left;
     kwargs_left = param_.InitAllowUnknown(kwargs);
     for (size_t i = 0; i < kwargs_left.size(); i++) {
-        if (!strcmp(kwargs_left[i].first.c_str(), "input_shape")) {
-          CHECK(sscanf(kwargs_left[i].second.c_str(), "%u,%u,%u", &shape_[0], &shape_[1], &shape_[2]) == 3)
-                       << "input_shape must be three consecutive integers without space example: 1,1,200 ";
-        }
         if (!strcmp(kwargs_left[i].first.c_str(), "rotate_list")) {
           const char* val = kwargs_left[i].second.c_str();
           const char *end = val + strlen(val);
@@ -109,6 +139,19 @@ class ImageAugmenter {
           }
         }
     }
+    if (param_.mean_img.length() != 0) {
+      dmlc::Stream *fi = dmlc::Stream::Create(param_.mean_img.c_str(), "r", true);
+      if (fi == NULL) {
+        this->CreateMeanImg();
+      } else {
+        if (param_.silent == 0) {
+          printf("loading mean image from %s\n", param_.mean_img.c_str());
+        }
+        meanimg_.LoadBinary(*fi);
+        delete fi;
+        meanfile_ready_ = true;
+      }
+    }
   }
   /*!
    * \brief augment src image, store result into dst
@@ -118,27 +161,27 @@ class ImageAugmenter {
    * \param source of random number
    * \param dst the pointer to the place where we want to store the result
    */
-  virtual cv::Mat Process(const cv::Mat &src,
+  virtual cv::Mat OpencvProcess(const cv::Mat &src,
                           common::RANDOM_ENGINE *prnd) {
     // shear
-    float s = NextDouble(prnd) * param_.max_shear_ratio_ * 2 - param_.max_shear_ratio_;
+    float s = NextDouble(prnd) * param_.max_shear_ratio * 2 - param_.max_shear_ratio;
     // rotate
-    int angle = NextUInt32(param_.max_rotate_angle_ * 2, prnd) - param_.max_rotate_angle_;
-    if (param_.rotate_ > 0) angle = param_.rotate_;
+    int angle = NextUInt32(param_.max_rotate_angle * 2, prnd) - param_.max_rotate_angle;
+    if (param_.rotate > 0) angle = param_.rotate;
     if (rotate_list_.size() > 0) {
       angle = rotate_list_[NextUInt32(rotate_list_.size() - 1, prnd)];
     }
     float a = cos(angle / 180.0 * M_PI);
     float b = sin(angle / 180.0 * M_PI);
     // scale
-    float scale = NextDouble(prnd) * (param_.max_random_scale_ - param_.min_random_scale_) + param_.min_random_scale_;
+    float scale = NextDouble(prnd) * (param_.max_random_scale - param_.min_random_scale) + param_.min_random_scale;
     // aspect ratio
-    float ratio = NextDouble(prnd) * param_.max_aspect_ratio_ * 2 - param_.max_aspect_ratio_ + 1;
+    float ratio = NextDouble(prnd) * param_.max_aspect_ratio * 2 - param_.max_aspect_ratio + 1;
     float hs = 2 * scale / (1 + ratio);
     float ws = ratio * hs;
     // new width and height
-    float new_width = std::max(param_.min_img_size_, std::min(param_.max_img_size_, scale * src.cols));
-    float new_height = std::max(param_.min_img_size_, std::min(param_.max_img_size_, scale * src.rows));
+    float new_width = std::max(param_.min_img_size, std::min(param_.max_img_size, scale * src.cols));
+    float new_height = std::max(param_.min_img_size, std::min(param_.max_img_size, scale * src.rows));
     //printf("%f %f %f %f %f %f %f %f %f\n", s, a, b, scale, ratio, hs, ws, new_width, new_height);
     cv::Mat M(2, 3, CV_32F);
     M.at<float>(0, 0) = hs * a - s * b * ws;
@@ -152,15 +195,16 @@ class ImageAugmenter {
     cv::warpAffine(src, temp, M, cv::Size(new_width, new_height),
                      cv::INTER_LINEAR,
                      cv::BORDER_CONSTANT,
-                     cv::Scalar(param_.fill_value_, param_.fill_value_, param_.fill_value_));
+                     cv::Scalar(param_.fill_value, param_.fill_value, param_.fill_value));
     cv::Mat res = temp;
-    if (param_.max_crop_size_ != -1 || param_.min_crop_size_ != -1){
-      CHECK(res.cols >= param_.max_crop_size_ && res.rows >= param_.max_crop_size_&& param_.max_crop_size_ >= param_.min_crop_size_)
+    // crop
+    if (param_.max_crop_size != -1 || param_.min_crop_size != -1){
+      CHECK(res.cols >= param_.max_crop_size && res.rows >= param_.max_crop_size && param_.max_crop_size >= param_.min_crop_size)
           << "input image size smaller than max_crop_size";
-      mshadow::index_t rand_crop_size = NextUInt32(param_.max_crop_size_- param_.min_crop_size_+1, prnd)+ param_.min_crop_size_;
+      mshadow::index_t rand_crop_size = NextUInt32(param_.max_crop_size- param_.min_crop_size+1, prnd)+ param_.min_crop_size;
       mshadow::index_t y = res.rows - rand_crop_size;
       mshadow::index_t x = res.cols - rand_crop_size;
-      if (param_.rand_crop_ != 0) {
+      if (param_.rand_crop != 0) {
         y = NextUInt32(y + 1, prnd);
         x = NextUInt32(x + 1, prnd);
       }
@@ -168,13 +212,13 @@ class ImageAugmenter {
         y /= 2; x /= 2;
       }
       cv::Rect roi(x, y, rand_crop_size, rand_crop_size);
-      cv::resize(res(roi), res, cv::Size(shape_[1], shape_[2]));
+      cv::resize(res(roi), res, cv::Size(param_.input_shape[1], param_.input_shape[2]));
     }
     else{
-      CHECK(static_cast<mshadow::index_t>(res.cols) >= shape_[1] && static_cast<mshadow::index_t>(res.rows) >= shape_[2]) 
+      CHECK(static_cast<mshadow::index_t>(res.cols) >= param_.input_shape[1] && static_cast<mshadow::index_t>(res.rows) >= param_.input_shape[2]) 
           << "input image size smaller than input shape";
-      mshadow::index_t y = res.rows - shape_[2];
-      mshadow::index_t x = res.cols - shape_[1];
+      mshadow::index_t y = res.rows - param_.input_shape[2];
+      mshadow::index_t x = res.cols - param_.input_shape[1];
       if (param_.rand_crop_ != 0) {
         y = NextUInt32(y + 1, prnd);
         x = NextUInt32(x + 1, prnd);
@@ -182,7 +226,7 @@ class ImageAugmenter {
       else {
         y /= 2; x /= 2;
       }
-      cv::Rect roi(x, y, shape_[1], shape_[2]);
+      cv::Rect roi(x, y, param_.input_shape[1], param_.input_shape[2]);
       res = res(roi);
     }
     return res;
@@ -195,9 +239,9 @@ class ImageAugmenter {
    * \param source of random number
    * \param dst the pointer to the place where we want to store the result
    */
-  virtual mshadow::Tensor<cpu, 3> Process(mshadow::Tensor<cpu, 3> data,
+  virtual mshadow::Tensor<cpu, 3> OpencvProcess(mshadow::Tensor<cpu, 3> data,
                                           common::RANDOM_ENGINE *prnd) {
-    if (!NeedProcess()) return data;
+    if (!NeedOpencvProcess()) return data;
     cv::Mat res(data.size(1), data.size(2), CV_8UC3);
     for (index_t i = 0; i < data.size(1); ++i) {
       for (index_t j = 0; j < data.size(2); ++j) {
@@ -206,7 +250,7 @@ class ImageAugmenter {
         res.at<cv::Vec3b>(i, j)[2] = data[0][i][j];
       }
     }
-    res = this->Process(res, prnd);
+    res = this->OpencvProcess(res, prnd);
     tmpres.Resize(mshadow::Shape3(3, res.rows, res.cols));
     for (index_t i = 0; i < tmpres.size(1); ++i) {
       for (index_t j = 0; j < tmpres.size(2); ++j) {
@@ -219,12 +263,12 @@ class ImageAugmenter {
     return tmpres;
   }
 
-  virtual void Process(unsigned char *dptr, size_t sz,
+  virtual void OpencvProcess(unsigned char *dptr, size_t sz,
                        mshadow::TensorContainer<cpu, 3> *p_data,
                        common::RANDOM_ENGINE *prnd) {
     cv::Mat buf(1, sz, CV_8U, dptr);
     cv::Mat res = cv::imdecode(buf, 1);
-    res = this->Process(res, prnd);
+    res = this->OpencvProcess(res, prnd);
     p_data->Resize(mshadow::Shape3(3, res.rows, res.cols));
     for (index_t i = 0; i < p_data->size(1); ++i) {
       for (index_t j = 0; j < p_data->size(2); ++j) {
@@ -237,20 +281,117 @@ class ImageAugmenter {
     res.release();
   }
 
+  void TensorProcess(mshadow::TensorContainer<cpu, 3> *p_data,
+                       common::RANDOM_ENGINE *prnd) {
+    img_.Resize(mshadow::Shape3((*p_data).shape_[0], param_.input_shape[1], param_.input_shape[2]));
+    if (param_.input_shape[1] == 1) {
+      img_ = (*p_data) * param_.scale;
+    } else {
+      CHECK(p_data->size(1) >= param_.input_shape[1] && p_data->size(2) >= param_.input_shape[2])
+          << "Data size must be bigger than the input size to net.";
+      mshadow::index_t yy = p_data->size(1) - param_.input_shape[1];
+      mshadow::index_t xx = p_data->size(2) - param_.input_shape[2];
+      if (param_.rand_crop != 0 && (yy != 0 || xx != 0)) {
+        yy = NextUInt32(yy + 1, prnd);
+        xx = NextUInt32(xx + 1, prnd);
+      } else {
+        yy /= 2; xx /= 2;
+      }
+      if (p_data->size(1) != param_.input_shape[1] && param_.crop_y_start != -1) {
+        yy = param_.crop_y_start;
+      }
+      if (p_data->size(2) != param_.input_shape[2] && param_.crop_x_start != -1) {
+        xx = param_.crop_x_start;
+      }
+      float contrast = NextDouble(prnd) * param_.max_random_contrast * 2 - param_.max_random_contrast + 1;
+      float illumination = NextDouble(prnd) * param_.max_random_illumination * 2 - param_.max_random_illumination;
+      if (param_.mean_r > 0.0f || param_.mean_g > 0.0f || param_.mean_b > 0.0f) {
+        // subtract mean value
+        (*p_data)[0] -= param_.mean_b; (*p_data)[1] -= param_.mean_g; (*p_data)[2] -= param_.mean_r;
+        if ((param_.rand_mirror != 0 && NextDouble(rnd) < 0.5f) || param_.mirror == 1) {
+          img_ = mirror(crop((*p_data) * contrast + illumination, img_[0].shape_, yy, xx)) * param_.scale;
+        } else {
+          img_ = crop((*p_data) * contrast + illumination, img_[0].shape_, yy, xx) * param_.scale ;
+        }
+      } else if (!meanfile_ready_ || param_.mean_img.length() == 0) {
+        // do not subtract anything
+        if (param_.rand_mirror != 0 && NextDouble(prnd) < 0.5f) {
+          img_ = mirror(crop((*p_data), img_[0].shape_, yy, xx)) * param_.scale;
+        } else {
+          img_ = crop((*p_data), img_[0].shape_, yy, xx) * param_.scale ;
+        }
+      } else {
+        // subtract mean image
+        if ((param_.rand_mirror != 0 && NextDouble(prnd) < 0.5f) || param_.mirror == 1) {
+          if (p_data->shape_ == meanimg_.shape_) {
+            img_ = mirror(crop(((*p_data) - meanimg_) * contrast + illumination, img_[0].shape_, yy, xx)) * param_.scale;
+          } else {
+            img_ = (mirror(crop((*p_data), img_[0].shape_, yy, xx) - meanimg_) * contrast + illumination) * param_.scale;
+          }
+        } else {
+          if (p_data->shape_ == meanimg_.shape_){
+            img_ = crop(((*p_data) - meanimg_) * contrast + illumination, img_[0].shape_, yy, xx) * param_.scale;
+          } else {
+            img_ = ((crop((*p_data), img_[0].shape_, yy, xx) - meanimg_) * contrast + illumination) * param_.scale;
+          }
+        }
+      }
+    }
+    out_.data = img_;
+  } 
+
+  inline void CreateMeanImg(void) {
+    if (silent_ == 0) {
+      printf("cannot find %s: create mean image, this will take some time...\n", name_meanimg_.c_str());
+    }
+    time_t start = time(NULL);
+    unsigned long elapsed = 0;
+    size_t imcnt = 1;
+
+    CHECK(this->Next_()) << "input iterator failed.";
+    meanimg_.Resize(mshadow::Shape3(shape_[0], shape_[1], shape_[2]));
+    mshadow::Copy(meanimg_, img_);
+    while (this->Next()) {
+      meanimg_ += img_; imcnt += 1;
+      elapsed = (long)(time(NULL) - start);
+      if (imcnt % 1000 == 0 && silent_ == 0) {
+        printf("\r                                                               \r");
+        printf("[%8lu] images processed, %ld sec elapsed", imcnt, elapsed);
+        fflush(stdout);
+      }
+    }
+    meanimg_ *= (1.0f / imcnt);
+
+    dmlc::Stream *fo = dmlc::Stream::Create(name_meanimg_.c_str(), "w");
+    meanimg_.SaveBinary(*fo);
+    delete fo;
+    if (silent_ == 0) {
+      printf("save mean image to %s..\n", name_meanimg_.c_str());
+    }
+    meanfile_ready_ = true;
+  }
+
+
  private:
-  // whether skip processing
-  inline bool NeedProcess(void) const {
-    if (param_.max_rotate_angle_ > 0 || param_.max_shear_ratio_ > 0.0f
-        || param_.rotate_ > 0 || rotate_list_.size() > 0) return true;
-    if (param_.min_crop_size_ > 0 && param_.max_crop_size_ > 0) return true;
+  // whether skip opencv processing
+  inline bool NeedOpencvProcess(void) const {
+    if (param_.max_rotate_angle > 0 || param_.max_shear_ratio > 0.0f
+        || param_.rotate > 0 || rotate_list_.size() > 0) return true;
+    if (param_.min_crop_size > 0 && param_.max_crop_size > 0) return true;
     return false;
   }
   // temp input space
-  mshadow::TensorContainer<cpu, 3> tmpres;
+  mshadow::TensorContainer<cpu, 3> tmpres_;
+  // mean image
+  mshadow::TensorContainer<cpu, 3> meanimg_;
+  /*! \brief temp space */
+  mshadow::TensorContainer<cpu, 3> img_;
   // temporal space
-  cv::Mat temp0, temp, temp2;
+  cv::Mat temp_;
   // rotation param
-  cv::Mat rotateM;
+  cv::Mat rotateM_;
+  // whether the mean file is ready
+  bool menafile_ready_;
   // parameters
   ImageAugmentParam param_;
   /*! \brief input shape */
diff --git a/src/io/iter_batch.h b/src/io/iter_batch.h
index f258bc2d6afd..4d95b92cce1e 100644
--- a/src/io/iter_batch.h
+++ b/src/io/iter_batch.h
@@ -16,26 +16,33 @@ namespace io {
 // Batch parameters
 struct BatchParam : public dmlc::Parameter<BatchParam> {
   /*! \brief label width */
-  index_t batch_size_;
+  index_t batch_size;
+  /*! \brief input shape */
+  // TODO: not all uses of shape_ have been updated yet
+  TShape input_shape;
   /*! \brief label width */
-  index_t label_width_;
+  index_t label_width;
   /*! \brief use round roubin to handle overflow batch */
-  bool round_batch_;
+  bool round_batch;
   /*! \brief skip read */
-  bool test_skipread_;
+  bool test_skipread;
   /*! \brief silent */
-  bool silent_;
+  bool silent;
   // declare parameters
   DMLC_DECLARE_PARAMETER(BatchParam) {
-    DMLC_DECLARE_FIELD(batch_size_).set_default(1)
+    DMLC_DECLARE_FIELD(batch_size)
         .describe("Batch size.");
-    DMLC_DECLARE_FIELD(label_width_).set_default(1)
+    float input_shape_default = {3, 224, 224};
+    DMLC_DECLARE_FIELD(input_shape).set_default(TShape(input_shape_default, input_shape_default + 3))
+        .set_expect_ndim(3).enforce_nonzero()
+        .describe("Input shape of the neural net");   
+    DMLC_DECLARE_FIELD(label_width).set_default(1)
         .describe("Label width.");
-    DMLC_DECLARE_FIELD(round_batch_).set_default(false)
+    DMLC_DECLARE_FIELD(round_batch).set_default(false)
         .describe("Use round robin to handle overflow batch.");
-    DMLC_DECLARE_FIELD(test_skipread_).set_default(false)
+    DMLC_DECLARE_FIELD(test_skipread).set_default(false)
         .describe("Skip read for testing.");
-    DMLC_DECLARE_FIELD(silent_).set_default(false)
+    DMLC_DECLARE_FIELD(silent).set_default(false)
         .describe("Whether to print batch information.");
   }
 };
@@ -54,20 +61,14 @@ class BatchAdaptIter: public IIterator<DataBatch> {
     std::vector<std::pair<std::string, std::string> > kwargs_left;
     // init batch param, it could have similar param with 
     kwargs_left = param_.InitAllowUnknown(kwargs);
-    for (size_t i = 0; i < kwargs_left.size(); i++) {
-      if (!strcmp(kwargs_left[i].first.c_str(), "input_shape")) {
-        CHECK(sscanf(kwargs_left[i].second.c_str(), "%u,%u,%u", &shape_[1], &shape_[2], &shape_[3]) == 3)
-          << "input_shape must be three consecutive integers without space example: 1,1,200 ";
-      }
-    }
     // init base iterator
     base_->Init(kwargs);
     mshadow::Shape<4> tshape = shape_;
-    tshape[0] = param_.batch_size_;
+    tshape[0] = param_.batch_size;
     AllocSpaceDense(false);
   }
   virtual void BeforeFirst(void) {
-    if (param_.round_batch_ == 0 || num_overflow_ == 0) {
+    if (param_.round_batch == 0 || num_overflow_ == 0) {
       // otherise, we already called before first
       base_->BeforeFirst();
     } else {
@@ -79,7 +80,7 @@ class BatchAdaptIter: public IIterator<DataBatch> {
     out_.num_batch_padd = 0;
 
     // skip read if in head version
-    if (param_.test_skipread_ != 0 && head_ == 0) return true;
+    if (param_.test_skipread != 0 && head_ == 0) return true;
     else this->head_ = 0;
 
     // if overflow from previous round, directly return false, until before first is called
@@ -92,17 +93,17 @@ class BatchAdaptIter: public IIterator<DataBatch> {
       out_.inst_index[top] = d.index;
       mshadow::Copy(data[top], d.data[0].get<mshadow::cpu, 3, float>());
 
-      if (++ top >= param_.batch_size_) {
+      if (++ top >= param_.batch_size) {
         out_.data[0] = TBlob(data);
         out_.data[1] = TBlob(label);
         return true;
       }
     }
     if (top != 0) {
-      if (param_.round_batch_ != 0) {
+      if (param_.round_batch != 0) {
         num_overflow_ = 0;
         base_->BeforeFirst();
-        for (; top < param_.batch_size_; ++top, ++num_overflow_) {
+        for (; top < param_.batch_size; ++top, ++num_overflow_) {
           CHECK(base_->Next()) << "number of input must be bigger than batch size";
           const DataInst& d = base_->Value();
           mshadow::Copy(label[top], d.data[1].get<mshadow::cpu, 1, float>());
@@ -111,7 +112,7 @@ class BatchAdaptIter: public IIterator<DataBatch> {
         }
         out_.num_batch_padd = num_overflow_;
       } else {
-        out_.num_batch_padd = param_.batch_size_ - top;
+        out_.num_batch_padd = param_.batch_size - top;
       }
       out_.data[0] = TBlob(data);
       out_.data[1] = TBlob(label);
@@ -128,8 +129,6 @@ class BatchAdaptIter: public IIterator<DataBatch> {
   BatchParam param_;
   /*! \brief base iterator */
   IIterator<DataInst> *base_;
-  /*! \brief input shape */
-  mshadow::Shape<4> shape_;
   /*! \brief output data */
   DataBatch out_;
   /*! \brief on first */
@@ -143,10 +142,10 @@ class BatchAdaptIter: public IIterator<DataBatch> {
   // Functions that allocate and free tensor space
   inline void AllocSpaceDense(bool pad = false) { 
     data = mshadow::NewTensor<mshadow::cpu>(shape_, 0.0f, pad);
-    mshadow::Shape<2> lshape = mshadow::Shape2(param_.batch_size_, param_.label_width_);
+    mshadow::Shape<2> lshape = mshadow::Shape2(param_.batch_size, param_.label_width);
     label = mshadow::NewTensor<mshadow::cpu>(lshape, 0.0f, pad);
-    out_.inst_index = new unsigned[param_.batch_size_];
-    out_.batch_size = param_.batch_size_;
+    out_.inst_index = new unsigned[param_.batch_size];
+    out_.batch_size = param_.batch_size;
     out_.data.resize(2);
   }
   /*! \brief auxiliary function to free space, if needed, dense only */
diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc
index 9977ddd2290c..1589fd5ad6c7 100644
--- a/src/io/iter_image_recordio.cc
+++ b/src/io/iter_image_recordio.cc
@@ -31,7 +31,7 @@ class ImageLabelMap {
   explicit ImageLabelMap(const char *path_imglist,
                          mshadow::index_t label_width,
                          bool silent) {
-    label_width_ = label_width;
+    label_width = label_width;
     image_index_.clear();
     label_.clear();
     idx2label_.clear();
@@ -45,7 +45,7 @@ class ImageLabelMap {
       // skip space
       while (isspace(*p) && p != end) ++p;
       image_index_.push_back(static_cast<size_t>(atol(p)));
-      for (size_t i = 0; i < label_width_; ++i) {
+      for (size_t i = 0; i < label_width; ++i) {
         // skip till space
         while (!isspace(*p) && p != end) ++p;
         // skip space
@@ -58,7 +58,7 @@ class ImageLabelMap {
     // be careful not to resize label_ afterwards
     idx2label_.reserve(image_index_.size());
     for (size_t i = 0; i < image_index_.size(); ++i) {
-      idx2label_[image_index_[i]] = dmlc::BeginPtr(label_) + i * label_width_;
+      idx2label_[image_index_[i]] = dmlc::BeginPtr(label_) + i * label_width;
     }
     if (!silent) {
       LOG(INFO) << "Loaded ImageList from " << path_imglist << ' '
@@ -70,12 +70,12 @@ class ImageLabelMap {
     std::unordered_map<size_t, real_t*>::const_iterator it
         = idx2label_.find(imid);
     CHECK(it != idx2label_.end()) << "fail to find imagelabel for id " << imid;
-    return mshadow::Tensor<cpu, 1>(it->second, mshadow::Shape1(label_width_));
+    return mshadow::Tensor<cpu, 1>(it->second, mshadow::Shape1(label_width));
   }
 
  private:
   // label with_
-  mshadow::index_t label_width_;
+  mshadow::index_t label_width;
   // image index of each record
   std::vector<size_t> image_index_;
   // real label content
@@ -87,32 +87,32 @@ class ImageLabelMap {
 // Define image record parser parameters
 struct ImageRecParserParam : public dmlc::Parameter<ImageRecParserParam> {
   /*! \brief path to image list */
-  std::string path_imglist_;
+  std::string path_imglist;
   /*! \brief path to image recordio */
-  std::string path_imgrec_;
+  std::string path_imgrec;
   /*! \brief number of threads */
-  int nthread_;
+  int nthread;
   /*! \brief whether to remain silent */
-  bool silent_;
+  bool silent;
   /*! \brief number of distributed worker */
-  int dist_num_worker_, dist_worker_rank_;
+  int dist_num_worker, dist_worker_rank;
   /*! \brief label-width */
-  int label_width_;
+  int label_width;
   // declare parameters
   DMLC_DECLARE_PARAMETER(ImageRecParserParam) {
-    DMLC_DECLARE_FIELD(path_imglist_).set_default("")
+    DMLC_DECLARE_FIELD(path_imglist).set_default("")
         .describe("Path to image list.");
-    DMLC_DECLARE_FIELD(path_imgrec_).set_default("./data/imgrec.rec")
+    DMLC_DECLARE_FIELD(path_imgrec).set_default("./data/imgrec.rec")
         .describe("Path to image record file.");
-    DMLC_DECLARE_FIELD(nthread_).set_lower_bound(1).set_default(4)
+    DMLC_DECLARE_FIELD(nthread).set_lower_bound(1).set_default(4)
         .describe("Number of thread to do parsing.");
-    DMLC_DECLARE_FIELD(label_width_).set_lower_bound(1).set_default(1)
+    DMLC_DECLARE_FIELD(label_width).set_lower_bound(1).set_default(1)
         .describe("How many labels for an image.");
-    DMLC_DECLARE_FIELD(silent_).set_default(false)
+    DMLC_DECLARE_FIELD(silent).set_default(false)
         .describe("Whether to output parser information.");
-    DMLC_DECLARE_FIELD(dist_num_worker_).set_lower_bound(1).set_default(1)
+    DMLC_DECLARE_FIELD(dist_num_worker).set_lower_bound(1).set_default(1)
         .describe("Dist worker number.");
-    DMLC_DECLARE_FIELD(dist_worker_rank_).set_default(0)
+    DMLC_DECLARE_FIELD(dist_worker_rank).set_default(0)
         .describe("Dist worker rank.");
   }
 };
@@ -170,12 +170,12 @@ inline void ImageRecordIOParser::Init(const std::vector<std::pair<std::string, s
   {
     maxthread = std::max(omp_get_num_procs() / 2 - 1, 1);
   }
-  param_.nthread_ = std::min(maxthread, param_.nthread_);
-  #pragma omp parallel num_threads(param_.nthread_)
+  param_.nthread = std::min(maxthread, param_.nthread);
+  #pragma omp parallel num_threads(param_.nthread)
   {
     threadget = omp_get_num_threads();
   }
-  param_.nthread_ = threadget;
+  param_.nthread = threadget;
   // setup decoders
   for (int i = 0; i < threadget; ++i) {
     augmenters_.push_back(new ImageAugmenter());
@@ -187,27 +187,27 @@ inline void ImageRecordIOParser::Init(const std::vector<std::pair<std::string, s
   // TODO, hack
   const char *ps_rank = getenv("PS_RANK");
   if (ps_rank != NULL) {
-    param_.dist_worker_rank_ = atoi(ps_rank);
+    param_.dist_worker_rank = atoi(ps_rank);
   }
 
-  if (param_.path_imglist_.length() != 0) {
-    label_map_ = new ImageLabelMap(param_.path_imglist_.c_str(),
-                                   param_.label_width_, param_.silent_ != 0);
+  if (param_.path_imglist.length() != 0) {
+    label_map_ = new ImageLabelMap(param_.path_imglist.c_str(),
+                                   param_.label_width, param_.silent != 0);
   } else {
-    param_.label_width_ = 1;
+    param_.label_width = 1;
   }
-  CHECK(param_.path_imgrec_.length() != 0)
+  CHECK(param_.path_imgrec.length() != 0)
     << "ImageRecordIOIterator: must specify image_rec";
 #if MSHADOW_DIST_PS
     // TODO move to a better place
-    param_.dist_num_worker_ = ::ps::RankSize();
-    param_.dist_worker_rank_ = ::ps::MyRank();
-    LOG(INFO) << "rank " << param_.dist_worker_rank_
-              << " in " << param_.dist_num_worker_;
+    param_.dist_num_worker = ::ps::RankSize();
+    param_.dist_worker_rank = ::ps::MyRank();
+    LOG(INFO) << "rank " << param_.dist_worker_rank
+              << " in " << param_.dist_num_worker;
 #endif
   source_ = dmlc::InputSplit::Create
-      (param_.path_imgrec_.c_str(), param_.dist_worker_rank_,
-       param_.dist_num_worker_, "recordio");
+      (param_.path_imgrec.c_str(), param_.dist_worker_rank,
+       param_.dist_num_worker, "recordio");
   // use 64 MB chunk when possible
   source_->HintChunkSize(8 << 20UL);
 }
@@ -217,12 +217,12 @@ ParseNext(std::vector<InstVector> *out_vec) {
   CHECK(source_ != NULL);
   dmlc::InputSplit::Blob chunk;
   if (!source_->NextChunk(&chunk)) return false;
-  out_vec->resize(param_.nthread_);
-  #pragma omp parallel num_threads(param_.nthread_)
+  out_vec->resize(param_.nthread);
+  #pragma omp parallel num_threads(param_.nthread)
   {
-    CHECK(omp_get_num_threads() == param_.nthread_);
+    CHECK(omp_get_num_threads() == param_.nthread);
     int tid = omp_get_thread_num();
-    dmlc::RecordIOChunkReader reader(chunk, tid, param_.nthread_);
+    dmlc::RecordIOChunkReader reader(chunk, tid, param_.nthread);
     ImageRecordIO rec;
     dmlc::InputSplit::Blob blob;
     // image data
@@ -237,7 +237,7 @@ ParseNext(std::vector<InstVector> *out_vec) {
       res = augmenters_[tid]->Process(res, prnds_[tid]);
       out.Push(static_cast<unsigned>(rec.image_index()),
                mshadow::Shape3(3, res.rows, res.cols),
-               mshadow::Shape1(param_.label_width_));
+               mshadow::Shape1(param_.label_width));
       DataInst inst = out.Back();
       // turn datainst into tensor
       mshadow::Tensor<mshadow::cpu, 3> data = inst.data[0].get<mshadow::cpu, 3, float>(); 

From bb6006376e3c431371ac1e0a358950ff0625235b Mon Sep 17 00:00:00 2001
From: tianjun <xiaotj1990327@gmail.com>
Date: Sun, 6 Sep 2015 10:05:36 +0800
Subject: [PATCH 11/15] call augprocess in base iter

---
 src/io/image_augmenter.h      | 40 +++++++++++++++++------------------
 src/io/iter_batch.h           |  4 ++--
 src/io/iter_image_recordio.cc | 24 +++++++--------------
 3 files changed, 30 insertions(+), 38 deletions(-)

diff --git a/src/io/image_augmenter.h b/src/io/image_augmenter.h
index a81e5297d5b3..38efcc58e61a 100644
--- a/src/io/image_augmenter.h
+++ b/src/io/image_augmenter.h
@@ -263,24 +263,6 @@ class ImageAugmenter {
     return tmpres;
   }
 
-  virtual void OpencvProcess(unsigned char *dptr, size_t sz,
-                       mshadow::TensorContainer<cpu, 3> *p_data,
-                       common::RANDOM_ENGINE *prnd) {
-    cv::Mat buf(1, sz, CV_8U, dptr);
-    cv::Mat res = cv::imdecode(buf, 1);
-    res = this->OpencvProcess(res, prnd);
-    p_data->Resize(mshadow::Shape3(3, res.rows, res.cols));
-    for (index_t i = 0; i < p_data->size(1); ++i) {
-      for (index_t j = 0; j < p_data->size(2); ++j) {
-        cv::Vec3b bgr = res.at<cv::Vec3b>(i, j);
-        (*p_data)[0][i][j] = bgr[2];
-        (*p_data)[1][i][j] = bgr[1];
-        (*p_data)[2][i][j] = bgr[0];
-      }
-    }
-    res.release();
-  }
-
   void TensorProcess(mshadow::TensorContainer<cpu, 3> *p_data,
                        common::RANDOM_ENGINE *prnd) {
     img_.Resize(mshadow::Shape3((*p_data).shape_[0], param_.input_shape[1], param_.input_shape[2]));
@@ -337,7 +319,7 @@ class ImageAugmenter {
         }
       }
     }
-    out_.data = img_;
+    (*p_data) = img_;
   } 
 
   inline void CreateMeanImg(void) {
@@ -371,7 +353,25 @@ class ImageAugmenter {
     meanfile_ready_ = true;
   }
 
-
+  virtual void Process(unsigned char *dptr, size_t sz,
+                       mshadow::TensorContainer<cpu, 3> *p_data,
+                       common::RANDOM_ENGINE *prnd) {
+    cv::Mat buf(1, sz, CV_8U, dptr);
+    cv::Mat res = cv::imdecode(buf, 1);
+    res = this->OpencvProcess(res, prnd);
+    p_data->Resize(mshadow::Shape3(3, res.rows, res.cols));
+    for (index_t i = 0; i < p_data->size(1); ++i) {
+      for (index_t j = 0; j < p_data->size(2); ++j) {
+        cv::Vec3b bgr = res.at<cv::Vec3b>(i, j);
+        (*p_data)[0][i][j] = bgr[2];
+        (*p_data)[1][i][j] = bgr[1];
+        (*p_data)[2][i][j] = bgr[0];
+      }
+    }
+    res.release();
+    this->TensorProcess(p_data, prnd);
+  }
+ 
  private:
   // whether skip opencv processing
   inline bool NeedOpencvProcess(void) const {
diff --git a/src/io/iter_batch.h b/src/io/iter_batch.h
index 4d95b92cce1e..7fe8f4440513 100644
--- a/src/io/iter_batch.h
+++ b/src/io/iter_batch.h
@@ -63,7 +63,7 @@ class BatchAdaptIter: public IIterator<DataBatch> {
     kwargs_left = param_.InitAllowUnknown(kwargs);
     // init base iterator
     base_->Init(kwargs);
-    mshadow::Shape<4> tshape = shape_;
+    mshadow::Shape<4> tshape = param_.input_shape;
     tshape[0] = param_.batch_size;
     AllocSpaceDense(false);
   }
@@ -141,7 +141,7 @@ class BatchAdaptIter: public IIterator<DataBatch> {
   mshadow::Tensor<mshadow::cpu, 4> data;
   // Functions that allocate and free tensor space
   inline void AllocSpaceDense(bool pad = false) { 
-    data = mshadow::NewTensor<mshadow::cpu>(shape_, 0.0f, pad);
+    data = mshadow::NewTensor<mshadow::cpu>(param_.input_shape, 0.0f, pad);
     mshadow::Shape<2> lshape = mshadow::Shape2(param_.batch_size, param_.label_width);
     label = mshadow::NewTensor<mshadow::cpu>(lshape, 0.0f, pad);
     out_.inst_index = new unsigned[param_.batch_size];
diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc
index 1589fd5ad6c7..0c44a2346e4a 100644
--- a/src/io/iter_image_recordio.cc
+++ b/src/io/iter_image_recordio.cc
@@ -98,6 +98,8 @@ struct ImageRecParserParam : public dmlc::Parameter<ImageRecParserParam> {
   int dist_num_worker, dist_worker_rank;
   /*! \brief label-width */
   int label_width;
+  /*! \brief input shape */
+  TShape input_shape;
   // declare parameters
   DMLC_DECLARE_PARAMETER(ImageRecParserParam) {
     DMLC_DECLARE_FIELD(path_imglist).set_default("")
@@ -114,6 +116,10 @@ struct ImageRecParserParam : public dmlc::Parameter<ImageRecParserParam> {
         .describe("Dist worker number.");
     DMLC_DECLARE_FIELD(dist_worker_rank).set_default(0)
         .describe("Dist worker rank.");
+    float input_shape_default = {3, 224, 224};
+    DMLC_DECLARE_FIELD(input_shape).set_default(TShape(input_shape_default, input_shape_default + 3))
+        .set_expect_ndim(3).enforce_nonzero()
+        .describe("Input shape of the neural net");  
   }
 };
 
@@ -229,33 +235,19 @@ ParseNext(std::vector<InstVector> *out_vec) {
     InstVector &out = (*out_vec)[tid];
     out.Clear();
     while (reader.NextRecord(&blob)) {
-      // result holder
-      cv::Mat res;
-      rec.Load(blob.dptr, blob.size);
-      cv::Mat buf(1, rec.content_size, CV_8U, rec.content);
-      res = cv::imdecode(buf, 1);
-      res = augmenters_[tid]->Process(res, prnds_[tid]);
       out.Push(static_cast<unsigned>(rec.image_index()),
-               mshadow::Shape3(3, res.rows, res.cols),
+               mshadow::Shape3(param_.input_shape[0], param_.input_shape[0], param_.input_shape[0]),
                mshadow::Shape1(param_.label_width));
       DataInst inst = out.Back();
       // turn datainst into tensor
       mshadow::Tensor<mshadow::cpu, 3> data = inst.data[0].get<mshadow::cpu, 3, float>(); 
       mshadow::Tensor<mshadow::cpu, 1> label = inst.data[1].get<mshadow::cpu, 1, float>(); 
-      for (int i = 0; i < res.rows; ++i) {
-        for (int j = 0; j < res.cols; ++j) {
-          cv::Vec3b bgr = res.at<cv::Vec3b>(i, j);
-          data[0][i][j] = bgr[2];
-          data[1][i][j] = bgr[1];
-          data[2][i][j] = bgr[0];
-        }
-      }
+      augmenters_[tid]->Process(rec.content, rec.content_size, &data, prnd);
       if (label_map_ != NULL) {
         mshadow::Copy(label, label_map_->Find(rec.image_index()));
       } else {
         label[0] = rec.header.label;
       }
-      res.release();
     }
   }
   return true;

From 0913273a7ad6154218db920de4950d2e2b7466d7 Mon Sep 17 00:00:00 2001
From: sneakerkg <xiaotj1990327@gmail.com>
Date: Tue, 8 Sep 2015 01:13:59 +0800
Subject: [PATCH 12/15] recio works

---
 src/common/utils.h            |   5 +-
 src/io/image_augmenter.h      | 192 ++++++++++++++++++----------------
 src/io/image_recordio.h       |   8 +-
 src/io/inst_vector.h          |  16 +--
 src/io/io.cc                  |   2 +-
 src/io/iter_batch.h           |  51 +++++----
 src/io/iter_image_recordio.cc |  99 ++++++++++++++----
 src/utils/decoder.h           | 128 -----------------------
 tests/python/test_io.py       |  70 +++++++++----
 9 files changed, 273 insertions(+), 298 deletions(-)
 delete mode 100644 src/utils/decoder.h

diff --git a/src/common/utils.h b/src/common/utils.h
index b5edb78bd6f9..29cb9f0e2f2a 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -22,9 +22,10 @@ namespace common {
  */
 typedef std::mt19937 RANDOM_ENGINE;
 // Get a double float, prnd is the pointer to a Random Engine
-#define NextDouble(prnd) std::generate_canonical<float, 10>(*prnd) 
+#define NextDouble(prnd) std::generate_canonical<float, 10>(*prnd)
 // Get a random int in [0, range)
-#define NextUInt32(range, prnd) static_cast<uint32_t>(floor(std::generate_canonical<float, 10>(*prnd) * range))
+#define NextUInt32(range, prnd) static_cast<uint32_t> \
+(floor(std::generate_canonical<float, 10>(*prnd) * range))
 
 /*!
  * \brief Helper functions.
diff --git a/src/io/image_augmenter.h b/src/io/image_augmenter.h
index 38efcc58e61a..a4b77f5a41df 100644
--- a/src/io/image_augmenter.h
+++ b/src/io/image_augmenter.h
@@ -1,4 +1,5 @@
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file image_augmenter_opencv.hpp
  * \brief threaded version of page iterator
  * \author Naiyan Wang, Tianqi Chen, Tianjun Xiao
@@ -7,6 +8,10 @@
 #define MXNET_IO_IMAGE_AUGMENTER_H_
 
 #include <opencv2/opencv.hpp>
+#include <utility>
+#include <string>
+#include <algorithm>
+#include <vector>
 #include "../common/utils.h"
 
 namespace mxnet {
@@ -41,6 +46,7 @@ struct ImageAugmentParam : public dmlc::Parameter<ImageAugmentParam> {
   int rotate;
   /*! \brief filled color while padding */
   int fill_value;
+  // The following are params for tensor process
   /*! \brief whether to mirror the image */
   bool mirror;
   /*! \brief whether to perform rand mirror the image */
@@ -55,14 +61,17 @@ struct ImageAugmentParam : public dmlc::Parameter<ImageAugmentParam> {
   float mean_b;
   /*! \brief shape of the image data*/
   TShape input_shape;
+  /*! \brief scale on color space */
+  float scale;
   /*! \brief maximum ratio of contrast variation */
-  float max_random_contrast_;
+  float max_random_contrast;
   /*! \brief maximum value of illumination variation */
-  float max_random_illumination_;
+  float max_random_illumination;
+  /*! \brief whether to print augment info */
+  bool silent;
   // declare parameters
-  // TODO: didn't understand the range for some params
   DMLC_DECLARE_PARAMETER(ImageAugmentParam) {
-    DMLC_DECLARE_FIELD(rand_crop_).set_default(true)
+    DMLC_DECLARE_FIELD(rand_crop).set_default(true)
         .describe("Whether we de random cropping");
     DMLC_DECLARE_FIELD(crop_y_start).set_default(-1)
         .describe("Where to nonrandom crop on y");
@@ -81,7 +90,7 @@ struct ImageAugmentParam : public dmlc::Parameter<ImageAugmentParam> {
     DMLC_DECLARE_FIELD(max_random_scale).set_default(1.0f)
         .describe("Maxmum scale ratio");
     DMLC_DECLARE_FIELD(min_random_scale).set_default(1.0f)
-        .describe("Minimum scale ratio");       
+        .describe("Minimum scale ratio");
     DMLC_DECLARE_FIELD(max_img_size).set_default(1e10f)
         .describe("Maxmum image size");
     DMLC_DECLARE_FIELD(min_img_size).set_default(0.0f)
@@ -99,13 +108,16 @@ struct ImageAugmentParam : public dmlc::Parameter<ImageAugmentParam> {
     DMLC_DECLARE_FIELD(mean_r).set_default(0.0f)
         .describe("Mean value on R channel");
     DMLC_DECLARE_FIELD(mean_g).set_default(0.0f)
-        .describe("Mean value on G channel");   
+        .describe("Mean value on G channel");
     DMLC_DECLARE_FIELD(mean_b).set_default(0.0f)
         .describe("Mean value on B channel");
-    float input_shape_default = {3, 224, 224};
-    DMLC_DECLARE_FIELD(input_shape).set_default(TShape(input_shape_default, input_shape_default + 3))
+    index_t input_shape_default[] = {3, 224, 224};
+    DMLC_DECLARE_FIELD(input_shape)
+        .set_default(TShape(input_shape_default, input_shape_default + 3))
         .set_expect_ndim(3).enforce_nonzero()
         .describe("Input shape of the neural net");
+    DMLC_DECLARE_FIELD(scale).set_default(1.0f)
+        .describe("Scale in color space");
     DMLC_DECLARE_FIELD(max_random_contrast).set_default(0.0f)
         .describe("Maximum ratio of contrast variation");
     DMLC_DECLARE_FIELD(max_random_illumination).set_default(0.0f)
@@ -123,7 +135,6 @@ class ImageAugmenter {
   }
   virtual ~ImageAugmenter() {
   }
-  // TODO: Hack the shape and rotate list, didn't use param
   virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
     std::vector<std::pair<std::string, std::string> > kwargs_left;
     kwargs_left = param_.InitAllowUnknown(kwargs);
@@ -142,7 +153,7 @@ class ImageAugmenter {
     if (param_.mean_img.length() != 0) {
       dmlc::Stream *fi = dmlc::Stream::Create(param_.mean_img.c_str(), "r", true);
       if (fi == NULL) {
-        this->CreateMeanImg();
+        meanfile_ready_ = false;
       } else {
         if (param_.silent == 0) {
           printf("loading mean image from %s\n", param_.mean_img.c_str());
@@ -174,15 +185,18 @@ class ImageAugmenter {
     float a = cos(angle / 180.0 * M_PI);
     float b = sin(angle / 180.0 * M_PI);
     // scale
-    float scale = NextDouble(prnd) * (param_.max_random_scale - param_.min_random_scale) + param_.min_random_scale;
+    float scale = NextDouble(prnd) * \
+        (param_.max_random_scale - param_.min_random_scale) + param_.min_random_scale;
     // aspect ratio
-    float ratio = NextDouble(prnd) * param_.max_aspect_ratio * 2 - param_.max_aspect_ratio + 1;
+    float ratio = NextDouble(prnd) * \
+        param_.max_aspect_ratio * 2 - param_.max_aspect_ratio + 1;
     float hs = 2 * scale / (1 + ratio);
     float ws = ratio * hs;
     // new width and height
-    float new_width = std::max(param_.min_img_size, std::min(param_.max_img_size, scale * src.cols));
-    float new_height = std::max(param_.min_img_size, std::min(param_.max_img_size, scale * src.rows));
-    //printf("%f %f %f %f %f %f %f %f %f\n", s, a, b, scale, ratio, hs, ws, new_width, new_height);
+    float new_width = std::max(param_.min_img_size, \
+            std::min(param_.max_img_size, scale * src.cols));
+    float new_height = std::max(param_.min_img_size, \
+            std::min(param_.max_img_size, scale * src.rows));
     cv::Mat M(2, 3, CV_32F);
     M.at<float>(0, 0) = hs * a - s * b * ws;
     M.at<float>(1, 0) = -b * ws;
@@ -192,42 +206,42 @@ class ImageAugmenter {
     float ori_center_height = M.at<float>(1, 0) * src.cols + M.at<float>(1, 1) * src.rows;
     M.at<float>(0, 2) = (new_width - ori_center_width) / 2;
     M.at<float>(1, 2) = (new_height - ori_center_height) / 2;
-    cv::warpAffine(src, temp, M, cv::Size(new_width, new_height),
+    cv::warpAffine(src, temp_, M, cv::Size(new_width, new_height),
                      cv::INTER_LINEAR,
                      cv::BORDER_CONSTANT,
                      cv::Scalar(param_.fill_value, param_.fill_value, param_.fill_value));
-    cv::Mat res = temp;
+    cv::Mat res = temp_;
     // crop
-    if (param_.max_crop_size != -1 || param_.min_crop_size != -1){
-      CHECK(res.cols >= param_.max_crop_size && res.rows >= param_.max_crop_size && param_.max_crop_size >= param_.min_crop_size)
+    if (param_.max_crop_size != -1 || param_.min_crop_size != -1) {
+      CHECK(res.cols >= param_.max_crop_size && res.rows >= \
+              param_.max_crop_size && param_.max_crop_size >= param_.min_crop_size)
           << "input image size smaller than max_crop_size";
-      mshadow::index_t rand_crop_size = NextUInt32(param_.max_crop_size- param_.min_crop_size+1, prnd)+ param_.min_crop_size;
+      mshadow::index_t rand_crop_size = NextUInt32(param_.max_crop_size \
+              - param_.min_crop_size+1, prnd)+ param_.min_crop_size;
       mshadow::index_t y = res.rows - rand_crop_size;
       mshadow::index_t x = res.cols - rand_crop_size;
       if (param_.rand_crop != 0) {
         y = NextUInt32(y + 1, prnd);
         x = NextUInt32(x + 1, prnd);
-      }
-      else {
+      } else {
         y /= 2; x /= 2;
       }
       cv::Rect roi(x, y, rand_crop_size, rand_crop_size);
       cv::resize(res(roi), res, cv::Size(param_.input_shape[1], param_.input_shape[2]));
-    }
-    else{
-      CHECK(static_cast<mshadow::index_t>(res.cols) >= param_.input_shape[1] && static_cast<mshadow::index_t>(res.rows) >= param_.input_shape[2]) 
-          << "input image size smaller than input shape";
-      mshadow::index_t y = res.rows - param_.input_shape[2];
-      mshadow::index_t x = res.cols - param_.input_shape[1];
-      if (param_.rand_crop_ != 0) {
-        y = NextUInt32(y + 1, prnd);
-        x = NextUInt32(x + 1, prnd);
-      }
-      else {
-        y /= 2; x /= 2;
-      }
-      cv::Rect roi(x, y, param_.input_shape[1], param_.input_shape[2]);
-      res = res(roi);
+    } else {
+        CHECK(static_cast<mshadow::index_t>(res.cols) >= param_.input_shape[1] \
+                && static_cast<mshadow::index_t>(res.rows) >= param_.input_shape[2])
+            << "input image size smaller than input shape";
+        mshadow::index_t y = res.rows - param_.input_shape[2];
+        mshadow::index_t x = res.cols - param_.input_shape[1];
+        if (param_.rand_crop != 0) {
+            y = NextUInt32(y + 1, prnd);
+            x = NextUInt32(x + 1, prnd);
+        } else {
+            y /= 2; x /= 2;
+        }
+        cv::Rect roi(x, y, param_.input_shape[1], param_.input_shape[2]);
+        res = res(roi);
     }
     return res;
   }
@@ -251,20 +265,32 @@ class ImageAugmenter {
       }
     }
     res = this->OpencvProcess(res, prnd);
-    tmpres.Resize(mshadow::Shape3(3, res.rows, res.cols));
-    for (index_t i = 0; i < tmpres.size(1); ++i) {
-      for (index_t j = 0; j < tmpres.size(2); ++j) {
+    tmpres_.Resize(mshadow::Shape3(3, res.rows, res.cols));
+    for (index_t i = 0; i < tmpres_.size(1); ++i) {
+      for (index_t j = 0; j < tmpres_.size(2); ++j) {
         cv::Vec3b bgr = res.at<cv::Vec3b>(i, j);
-        tmpres[0][i][j] = bgr[2];
-        tmpres[1][i][j] = bgr[1];
-        tmpres[2][i][j] = bgr[0];
+        tmpres_[0][i][j] = bgr[2];
+        tmpres_[1][i][j] = bgr[1];
+        tmpres_[2][i][j] = bgr[0];
       }
     }
-    return tmpres;
+    return tmpres_;
   }
 
   void TensorProcess(mshadow::TensorContainer<cpu, 3> *p_data,
                        common::RANDOM_ENGINE *prnd) {
+    // Check Newly Created mean image
+    if (meanfile_ready_ == false && param_.mean_img.length() != 0) {
+      dmlc::Stream *fi = dmlc::Stream::Create(param_.mean_img.c_str(), "r", true);
+      if (fi != NULL) {
+        if (param_.silent == 0) {
+          printf("loading mean image from %s\n", param_.mean_img.c_str());
+        }
+        meanimg_.LoadBinary(*fi);
+        delete fi;
+        meanfile_ready_ = true;
+      }
+    }
     img_.Resize(mshadow::Shape3((*p_data).shape_[0], param_.input_shape[1], param_.input_shape[2]));
     if (param_.input_shape[1] == 1) {
       img_ = (*p_data) * param_.scale;
@@ -285,72 +311,51 @@ class ImageAugmenter {
       if (p_data->size(2) != param_.input_shape[2] && param_.crop_x_start != -1) {
         xx = param_.crop_x_start;
       }
-      float contrast = NextDouble(prnd) * param_.max_random_contrast * 2 - param_.max_random_contrast + 1;
-      float illumination = NextDouble(prnd) * param_.max_random_illumination * 2 - param_.max_random_illumination;
+      float contrast = NextDouble(prnd) * param_.max_random_contrast \
+                       * 2 - param_.max_random_contrast + 1;
+      float illumination = NextDouble(prnd) * param_.max_random_illumination \
+                           * 2 - param_.max_random_illumination;
       if (param_.mean_r > 0.0f || param_.mean_g > 0.0f || param_.mean_b > 0.0f) {
         // substract mean value
-        (*p_data)[0] -= param_.mean_b; (*p_data)[1] -= param_.mean_g; (*p_data)[2] -= param_.mean_r;
-        if ((param_.rand_mirror != 0 && NextDouble(rnd) < 0.5f) || param_.mirror == 1) {
-          img_ = mirror(crop((*p_data) * contrast + illumination, img_[0].shape_, yy, xx)) * param_.scale;
+        (*p_data)[0] -= param_.mean_b;
+        (*p_data)[1] -= param_.mean_g;
+        (*p_data)[2] -= param_.mean_r;
+        if ((param_.rand_mirror != 0 && NextDouble(prnd) < 0.5f) || param_.mirror == 1) {
+          img_ = mirror(crop((*p_data) * contrast + illumination, \
+                      img_[0].shape_, yy, xx)) * param_.scale;
         } else {
-          img_ = crop((*p_data) * contrast + illumination, img_[0].shape_, yy, xx) * param_.scale ;
+          img_ = crop((*p_data) * contrast + illumination, \
+                  img_[0].shape_, yy, xx) * param_.scale;
         }
       } else if (!meanfile_ready_ || param_.mean_img.length() == 0) {
         // do not substract anything
-        if (param_.rand_mirror != 0 && NextDouble(prnd) < 0.5f) {
+        if ((param_.rand_mirror != 0 && NextDouble(prnd) < 0.5f) || param_.mirror == 1) {
           img_ = mirror(crop((*p_data), img_[0].shape_, yy, xx)) * param_.scale;
         } else {
-          img_ = crop((*p_data), img_[0].shape_, yy, xx) * param_.scale ;
+          img_ = crop((*p_data), img_[0].shape_, yy, xx) * param_.scale;
         }
       } else {
         // substract mean image
         if ((param_.rand_mirror != 0 && NextDouble(prnd) < 0.5f) || param_.mirror == 1) {
           if (p_data->shape_ == meanimg_.shape_) {
-            img_ = mirror(crop(((*p_data) - meanimg_) * contrast + illumination, img_[0].shape_, yy, xx)) * param_.scale;
+            img_ = mirror(crop(((*p_data) - meanimg_) * contrast \
+                        + illumination, img_[0].shape_, yy, xx)) * param_.scale;
           } else {
-            img_ = (mirror(crop((*p_data), img_[0].shape_, yy, xx) - meanimg_) * contrast + illumination) * param_.scale;
+            img_ = (mirror(crop((*p_data), img_[0].shape_, yy, xx) - meanimg_) \
+                    * contrast + illumination) * param_.scale;
           }
         } else {
-          if (p_data->shape_ == meanimg_.shape_){
-            img_ = crop(((*p_data) - meanimg_) * contrast + illumination, img_[0].shape_, yy, xx) * param_.scale;
+          if (p_data->shape_ == meanimg_.shape_) {
+            img_ = crop(((*p_data) - meanimg_) * contrast + illumination, \
+                    img_[0].shape_, yy, xx) * param_.scale;
           } else {
-            img_ = ((crop((*p_data), img_[0].shape_, yy, xx) - meanimg_) * contrast + illumination) * param_.scale;
+            img_ = ((crop((*p_data), img_[0].shape_, yy, xx) - meanimg_) * \
+                    contrast + illumination) * param_.scale;
           }
         }
       }
     }
     (*p_data) = img_;
-  } 
-
-  inline void CreateMeanImg(void) {
-    if (silent_ == 0) {
-      printf("cannot find %s: create mean image, this will take some time...\n", name_meanimg_.c_str());
-    }
-    time_t start = time(NULL);
-    unsigned long elapsed = 0;
-    size_t imcnt = 1;
-
-    CHECK(this->Next_()) << "input iterator failed.";
-    meanimg_.Resize(mshadow::Shape3(shape_[0], shape_[1], shape_[2]));
-    mshadow::Copy(meanimg_, img_);
-    while (this->Next()) {
-      meanimg_ += img_; imcnt += 1;
-      elapsed = (long)(time(NULL) - start);
-      if (imcnt % 1000 == 0 && silent_ == 0) {
-        printf("\r                                                               \r");
-        printf("[%8lu] images processed, %ld sec elapsed", imcnt, elapsed);
-        fflush(stdout);
-      }
-    }
-    meanimg_ *= (1.0f / imcnt);
-
-    dmlc::Stream *fo = dmlc::Stream::Create(name_meanimg_.c_str(), "w");
-    meanimg_.SaveBinary(*fo);
-    delete fo;
-    if (silent_ == 0) {
-      printf("save mean image to %s..\n", name_meanimg_.c_str());
-    }
-    meanfile_ready_ = true;
   }
 
   virtual void Process(unsigned char *dptr, size_t sz,
@@ -358,7 +363,8 @@ class ImageAugmenter {
                        common::RANDOM_ENGINE *prnd) {
     cv::Mat buf(1, sz, CV_8U, dptr);
     cv::Mat res = cv::imdecode(buf, 1);
-    res = this->OpencvProcess(res, prnd);
+    if (NeedOpencvProcess())
+        res = this->OpencvProcess(res, prnd);
     p_data->Resize(mshadow::Shape3(3, res.rows, res.cols));
     for (index_t i = 0; i < p_data->size(1); ++i) {
       for (index_t j = 0; j < p_data->size(2); ++j) {
@@ -371,7 +377,7 @@ class ImageAugmenter {
     res.release();
     this->TensorProcess(p_data, prnd);
   }
- 
+
  private:
   // whether skip opencv processing
   inline bool NeedOpencvProcess(void) const {
@@ -391,7 +397,7 @@ class ImageAugmenter {
   // rotation param
   cv::Mat rotateM_;
   // whether the mean file is ready
-  bool menafile_ready_;
+  bool meanfile_ready_;
   // parameters
   ImageAugmentParam param_;
   /*! \brief input shape */
@@ -400,5 +406,5 @@ class ImageAugmenter {
   std::vector<int> rotate_list_;
 };
 }  // namespace io
-}  // namespace cxxnet
-#endif
+}  // namespace mxnet
+#endif  // MXNET_IO_IMAGE_AUGMENTER_H_
diff --git a/src/io/image_recordio.h b/src/io/image_recordio.h
index 4aea8aabcb47..3b4fa0302435 100644
--- a/src/io/image_recordio.h
+++ b/src/io/image_recordio.h
@@ -1,4 +1,5 @@
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file image_recordio.h
  * \brief image recordio struct
  */
@@ -7,6 +8,7 @@
 
 #include <dmlc/base.h>
 #include <dmlc/io.h>
+#include <string>
 
 namespace mxnet {
 namespace io {
@@ -67,9 +69,9 @@ struct ImageRecordIO {
    */
   inline void SaveHeader(std::string *blob) const {
     blob->resize(sizeof(header));
-    std::memcpy(dmlc::BeginPtr(*blob), &header, sizeof(header));    
-  }  
-}; 
+    std::memcpy(dmlc::BeginPtr(*blob), &header, sizeof(header));
+  }
+};
 }  // namespace io
 }  // namespace mxnet
 #endif  // MXNET_IO_IMAGE_RECORDIO_H_
diff --git a/src/io/inst_vector.h b/src/io/inst_vector.h
index 4ced7dd64c63..ed560fc2b5da 100644
--- a/src/io/inst_vector.h
+++ b/src/io/inst_vector.h
@@ -1,11 +1,12 @@
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file inst_vector.h
  * \brief holder of a sequence of DataInst in CPU
  *        that are not necessarily of same shape
  */
 
-#ifndef MXNET_INST_VECTOR_H_
-#define MXNET_INST_VECTOR_H_
+#ifndef MXNET_IO_INST_VECTOR_H_
+#define MXNET_IO_INST_VECTOR_H_
 
 #include <mxnet/io.h>
 #include <mxnet/base.h>
@@ -31,7 +32,7 @@ class TensorVector {
     CHECK(i + 1 < offset_.size());
     CHECK(shape_[i].Size() == offset_[i + 1] - offset_[i]);
     return mshadow::Tensor<cpu, dim, DType>
-        ((DType*)dmlc::BeginPtr(content_) + offset_[i], shape_[i]);
+        ((DType*)dmlc::BeginPtr(content_) + offset_[i], shape_[i]);  // NOLINT(*)
   }
   inline mshadow::Tensor<cpu, dim, DType> Back() const {
     return (*this)[Size() - 1];
@@ -52,6 +53,7 @@ class TensorVector {
     content_.clear();
     shape_.clear();
   }
+
  private:
   // offset of the data content
   std::vector<size_t> offset_;
@@ -66,7 +68,7 @@ class TensorVector {
  * non-uniform shape data instance in a shape efficient way
  */
 class InstVector {
- public:  
+ public:
   inline size_t Size(void) const {
     return index_.size();
   }
@@ -94,8 +96,8 @@ class InstVector {
     data_.Push(dshape);
     label_.Push(lshape);
   }
-  
- private:  
+
+ private:
   /*! \brief index of the data */
   std::vector<unsigned> index_;
   // label
@@ -105,4 +107,4 @@ class InstVector {
 };
 }  // namespace io
 }  // namespace mxnet
-#endif  // MXNET_TENSOR_VECTOR_H_
+#endif  // MXNET_IO_INST_VECTOR_H_
diff --git a/src/io/io.cc b/src/io/io.cc
index b2dbc9f8c2c5..8bfb5dbdd570 100644
--- a/src/io/io.cc
+++ b/src/io/io.cc
@@ -17,5 +17,5 @@ namespace io {
 // Register parameters in header files
 DMLC_REGISTER_PARAMETER(BatchParam);
 DMLC_REGISTER_PARAMETER(ImageAugmentParam);
-}  // namespace mxnet
 }  // namespace io
+}  // namespace mxnet
diff --git a/src/io/iter_batch.h b/src/io/iter_batch.h
index 7fe8f4440513..b45dfd3328e1 100644
--- a/src/io/iter_batch.h
+++ b/src/io/iter_batch.h
@@ -1,7 +1,8 @@
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file iter_batch_proc-inl.hpp
  * \brief definition of preprocessing iterators that takes an iterator and do some preprocessing
- * \author Tianqi Chen
+ * \author Tianqi Chen, Tianjun Xiao
  */
 #ifndef MXNET_IO_ITER_BATCH_H_
 #define MXNET_IO_ITER_BATCH_H_
@@ -10,6 +11,9 @@
 #include <mxnet/base.h>
 #include <dmlc/logging.h>
 #include <mshadow/tensor.h>
+#include <utility>
+#include <string>
+#include <vector>
 
 namespace mxnet {
 namespace io {
@@ -18,7 +22,6 @@ struct BatchParam : public dmlc::Parameter<BatchParam> {
   /*! \brief label width */
   index_t batch_size;
   /*! \brief input shape */
-  // TODO: haven't modify all shape_
   TShape input_shape;
   /*! \brief label width */
   index_t label_width;
@@ -32,13 +35,14 @@ struct BatchParam : public dmlc::Parameter<BatchParam> {
   DMLC_DECLARE_PARAMETER(BatchParam) {
     DMLC_DECLARE_FIELD(batch_size)
         .describe("Batch size.");
-    float input_shape_default = {3, 224, 224};
-    DMLC_DECLARE_FIELD(input_shape).set_default(TShape(input_shape_default, input_shape_default + 3))
+    index_t input_shape_default[] = {3, 224, 224};
+    DMLC_DECLARE_FIELD(input_shape)
+        .set_default(TShape(input_shape_default, input_shape_default + 3))
         .set_expect_ndim(3).enforce_nonzero()
-        .describe("Input shape of the neural net");   
+        .describe("Input shape of the neural net");
     DMLC_DECLARE_FIELD(label_width).set_default(1)
         .describe("Label width.");
-    DMLC_DECLARE_FIELD(round_batch).set_default(false)
+    DMLC_DECLARE_FIELD(round_batch).set_default(true)
         .describe("Use round robin to handle overflow batch.");
     DMLC_DECLARE_FIELD(test_skipread).set_default(false)
         .describe("Skip read for testing.");
@@ -46,25 +50,25 @@ struct BatchParam : public dmlc::Parameter<BatchParam> {
         .describe("Whether to print batch information.");
   }
 };
-    
+
 /*! \brief create a batch iterator from single instance iterator */
 class BatchAdaptIter: public IIterator<DataBatch> {
-public:
-  BatchAdaptIter(IIterator<DataInst> *base): base_(base) {
-    num_overflow_ = 0;
-  }
+ public:
+  explicit BatchAdaptIter(IIterator<DataInst> *base): base_(base), num_overflow_(0) {}
   virtual ~BatchAdaptIter(void) {
     delete base_;
     FreeSpaceDense();
   }
   virtual void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
     std::vector<std::pair<std::string, std::string> > kwargs_left;
-    // init batch param, it could have similar param with 
+    // init batch param, it could have similar param with
     kwargs_left = param_.InitAllowUnknown(kwargs);
     // init base iterator
     base_->Init(kwargs);
-    mshadow::Shape<4> tshape = param_.input_shape;
-    tshape[0] = param_.batch_size;
+    data_shape_[1] = param_.input_shape[0];
+    data_shape_[2] = param_.input_shape[1];
+    data_shape_[3] = param_.input_shape[2];
+    data_shape_[0] = param_.batch_size;
     AllocSpaceDense(false);
   }
   virtual void BeforeFirst(void) {
@@ -80,8 +84,10 @@ class BatchAdaptIter: public IIterator<DataBatch> {
     out_.num_batch_padd = 0;
 
     // skip read if in head version
-    if (param_.test_skipread != 0 && head_ == 0) return true;
-    else this->head_ = 0;
+    if (param_.test_skipread != 0 && head_ == 0)
+        return true;
+    else
+        this->head_ = 0;
 
     // if overflow from previous round, directly return false, until before first is called
     if (num_overflow_ != 0) return false;
@@ -124,7 +130,8 @@ class BatchAdaptIter: public IIterator<DataBatch> {
     CHECK(head_ == 0) << "must call Next to get value";
     return out_;
   }
-private:
+
+ private:
   /*! \brief batch parameters */
   BatchParam param_;
   /*! \brief base iterator */
@@ -139,9 +146,11 @@ class BatchAdaptIter: public IIterator<DataBatch> {
   mshadow::Tensor<mshadow::cpu, 2> label;
   /*! \brief content of dense data, if this DataBatch is dense */
   mshadow::Tensor<mshadow::cpu, 4> data;
+  /*! \brief data shape */
+  mshadow::Shape<4> data_shape_;
   // Functions that allocate and free tensor space
-  inline void AllocSpaceDense(bool pad = false) { 
-    data = mshadow::NewTensor<mshadow::cpu>(param_.input_shape, 0.0f, pad);
+  inline void AllocSpaceDense(bool pad = false) {
+    data = mshadow::NewTensor<mshadow::cpu>(data_shape_, 0.0f, pad);
     mshadow::Shape<2> lshape = mshadow::Shape2(param_.batch_size, param_.label_width);
     label = mshadow::NewTensor<mshadow::cpu>(lshape, 0.0f, pad);
     out_.inst_index = new unsigned[param_.batch_size];
@@ -157,7 +166,7 @@ class BatchAdaptIter: public IIterator<DataBatch> {
       label.dptr_ = NULL;
     }
   }
-}; // class BatchAdaptIter
+};  // class BatchAdaptIter
 }  // namespace io
-}  // namespace cxxnet
+}  // namespace mxnet
 #endif  // MXNET_IO_ITER_BATCH_H_
diff --git a/src/io/iter_image_recordio.cc b/src/io/iter_image_recordio.cc
index 0c44a2346e4a..701c28deb4c9 100644
--- a/src/io/iter_image_recordio.cc
+++ b/src/io/iter_image_recordio.cc
@@ -1,9 +1,9 @@
 /*!
+ *  Copyright (c) 2015 by Contributors
  * \file iter_image_recordio-inl.hpp
  * \brief recordio data
 iterator
  */
-#include <cstdlib>
 #include <dmlc/base.h>
 #include <dmlc/io.h>
 #include <dmlc/omp.h>
@@ -13,11 +13,11 @@ iterator
 #include <dmlc/threadediter.h>
 #include <unordered_map>
 #include <vector>
+#include <cstdlib>
 #include "./inst_vector.h"
 #include "./image_recordio.h"
 #include "./image_augmenter.h"
 #include "./iter_batch.h"
-#include "../utils/decoder.h"
 namespace mxnet {
 namespace io {
 /*! \brief data structure to hold labels for images */
@@ -31,7 +31,7 @@ class ImageLabelMap {
   explicit ImageLabelMap(const char *path_imglist,
                          mshadow::index_t label_width,
                          bool silent) {
-    label_width = label_width;
+    this->label_width = label_width;
     image_index_.clear();
     label_.clear();
     idx2label_.clear();
@@ -116,10 +116,11 @@ struct ImageRecParserParam : public dmlc::Parameter<ImageRecParserParam> {
         .describe("Dist worker number.");
     DMLC_DECLARE_FIELD(dist_worker_rank).set_default(0)
         .describe("Dist worker rank.");
-    float input_shape_default = {3, 224, 224};
-    DMLC_DECLARE_FIELD(input_shape).set_default(TShape(input_shape_default, input_shape_default + 3))
+    index_t input_shape_default[] = {3, 224, 224};
+    DMLC_DECLARE_FIELD(input_shape)
+        .set_default(TShape(input_shape_default, input_shape_default + 3))
         .set_expect_ndim(3).enforce_nonzero()
-        .describe("Input shape of the neural net");  
+        .describe("Input shape of the neural net");
   }
 };
 
@@ -143,7 +144,7 @@ class ImageRecordIOParser {
   }
   // initialize the parser
   inline void Init(const std::vector<std::pair<std::string, std::string> >& kwargs);
-  
+
   // set record to the head
   inline void BeforeFirst(void) {
     return source_->BeforeFirst();
@@ -151,11 +152,12 @@ class ImageRecordIOParser {
   // parse next set of records, return an array of
   // instance vector to the user
   inline bool ParseNext(std::vector<InstVector> *out);
+
  private:
   // magic nyumber to see prng
   static const int kRandMagic = 111;
   /*! \brief parameters */
-  ImageRecParserParam param_; 
+  ImageRecParserParam param_;
   /*! \brief augmenters */
   std::vector<ImageAugmenter*> augmenters_;
   /*! \brief random samplers */
@@ -164,9 +166,12 @@ class ImageRecordIOParser {
   dmlc::InputSplit *source_;
   /*! \brief label information, if any */
   ImageLabelMap *label_map_;
+  /*! \brief temp space */
+  mshadow::TensorContainer<cpu, 3> img_;
 };
 
-inline void ImageRecordIOParser::Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+inline void ImageRecordIOParser::Init(
+        const std::vector<std::pair<std::string, std::string> >& kwargs) {
   // initialize parameter
   std::vector<std::pair<std::string, std::string> > kwargs_left;
   // init image rec param
@@ -185,12 +190,11 @@ inline void ImageRecordIOParser::Init(const std::vector<std::pair<std::string, s
   // setup decoders
   for (int i = 0; i < threadget; ++i) {
     augmenters_.push_back(new ImageAugmenter());
-    augmenters_[i]->Init(kwargs_left);
+    augmenters_[i]->Init(kwargs);
     prnds_.push_back(new common::RANDOM_ENGINE((i + 1) * kRandMagic));
   }
-  
+
   // handling for hadoop
-  // TODO, hack
   const char *ps_rank = getenv("PS_RANK");
   if (ps_rank != NULL) {
     param_.dist_worker_rank = atoi(ps_rank);
@@ -205,7 +209,6 @@ inline void ImageRecordIOParser::Init(const std::vector<std::pair<std::string, s
   CHECK(param_.path_imgrec.length() != 0)
     << "ImageRecordIOIterator: must specify image_rec";
 #if MSHADOW_DIST_PS
-    // TODO move to a better place
     param_.dist_num_worker = ::ps::RankSize();
     param_.dist_worker_rank = ::ps::MyRank();
     LOG(INFO) << "rank " << param_.dist_worker_rank
@@ -235,14 +238,16 @@ ParseNext(std::vector<InstVector> *out_vec) {
     InstVector &out = (*out_vec)[tid];
     out.Clear();
     while (reader.NextRecord(&blob)) {
+      rec.Load(blob.dptr, blob.size);
       out.Push(static_cast<unsigned>(rec.image_index()),
-               mshadow::Shape3(param_.input_shape[0], param_.input_shape[0], param_.input_shape[0]),
+               mshadow::Shape3(param_.input_shape[0], param_.input_shape[1], param_.input_shape[2]),
                mshadow::Shape1(param_.label_width));
       DataInst inst = out.Back();
       // turn datainst into tensor
-      mshadow::Tensor<mshadow::cpu, 3> data = inst.data[0].get<mshadow::cpu, 3, float>(); 
-      mshadow::Tensor<mshadow::cpu, 1> label = inst.data[1].get<mshadow::cpu, 1, float>(); 
-      augmenters_[tid]->Process(rec.content, rec.content_size, &data, prnd);
+      mshadow::Tensor<mshadow::cpu, 3> data = inst.data[0].get<mshadow::cpu, 3, float>();
+      mshadow::Tensor<mshadow::cpu, 1> label = inst.data[1].get<mshadow::cpu, 1, float>();
+      augmenters_[tid]->Process(rec.content, rec.content_size, &img_, prnds_[tid]);
+      mshadow::Copy(data, img_);
       if (label_map_ != NULL) {
         mshadow::Copy(label, label_map_->Find(rec.image_index()));
       } else {
@@ -259,12 +264,20 @@ struct ImageRecordParam: public dmlc::Parameter<ImageRecordParam> {
   bool shuffle;
   /*! \brief random seed */
   int seed;
+  /*! \brief mean file string*/
+  std::string mean_img;
+  /*! \brief whether to remain silent */
+  bool silent;
   // declare parameters
   DMLC_DECLARE_PARAMETER(ImageRecordParam) {
     DMLC_DECLARE_FIELD(shuffle).set_default(true)
         .describe("Whether to shuffle data.");
     DMLC_DECLARE_FIELD(seed).set_default(0)
         .describe("Random Seed.");
+    DMLC_DECLARE_FIELD(mean_img).set_default("./data/mean.bin")
+        .describe("Path to image mean file.");
+    DMLC_DECLARE_FIELD(silent).set_default(false)
+        .describe("Whether to output information.");
   }
 };
 
@@ -283,8 +296,8 @@ class ImageRecordIter : public IIterator<DataInst> {
     std::vector<std::pair<std::string, std::string> > kwargs_left;
     // init image rec param
     kwargs_left = param_.InitAllowUnknown(kwargs);
-    // use the left kwarg to init parser
-    parser_.Init(kwargs_left);
+    // use the kwarg to init parser
+    parser_.Init(kwargs);
     // init thread iter
     iter_.set_max_capacity(4);
     iter_.Init([this](std::vector<InstVector> **dptr) {
@@ -294,6 +307,15 @@ class ImageRecordIter : public IIterator<DataInst> {
         return parser_.ParseNext(*dptr);
       },
       [this]() { parser_.BeforeFirst(); });
+    // Check Meanfile
+    if (param_.mean_img.length() != 0) {
+      dmlc::Stream *fi = dmlc::Stream::Create(param_.mean_img.c_str(), "r", true);
+      if (fi == NULL) {
+        this->CreateMeanImg();
+      } else {
+        delete fi;
+      }
+    }
     inst_ptr_ = 0;
   }
   virtual void BeforeFirst(void) {
@@ -320,7 +342,8 @@ class ImageRecordIter : public IIterator<DataInst> {
         }
         // shuffle instance order if needed
         if (shuffle_ != 0) {
-            std::shuffle(inst_order_.begin(), inst_order_.end(), common::RANDOM_ENGINE(kRandMagic + param_.seed));
+            std::shuffle(inst_order_.begin(), inst_order_.end(), \
+                    common::RANDOM_ENGINE(kRandMagic + param_.seed));
         }
         inst_ptr_ = 0;
       }
@@ -332,6 +355,40 @@ class ImageRecordIter : public IIterator<DataInst> {
   }
 
  private:
+  /*! \brief average every image yielded by this iterator and save it to param_.mean_img */
+  inline void CreateMeanImg(void) {
+    if (param_.silent == 0) {
+      printf("cannot find %s: create mean image, this will take some time...\n",
+              param_.mean_img.c_str());
+    }
+    time_t start = time(NULL);
+    double elapsed = 0.0;  // seconds since start, as returned by difftime
+    size_t imcnt = 1;
+    this->BeforeFirst();
+    CHECK(this->Next()) << "input iterator failed.";
+    // Get the first data
+    mshadow::Tensor<mshadow::cpu, 3> img_tensor = out_.data[0].get<mshadow::cpu, 3, float>();
+    meanimg_.Resize(img_tensor.shape_);
+    mshadow::Copy(meanimg_, img_tensor);
+    while (this->Next()) {
+      mshadow::Tensor<mshadow::cpu, 3> img_tensor = out_.data[0].get<mshadow::cpu, 3, float>();
+      meanimg_ += img_tensor; imcnt += 1;
+      elapsed = difftime(time(NULL), start);
+      if (imcnt % 1000 == 0 && param_.silent == 0) {
+        printf("\r                                                               \r");
+        printf("[%8zu] images processed, %.0f sec elapsed", imcnt, elapsed);
+        fflush(stdout);
+      }
+    }
+    meanimg_ *= (1.0f / imcnt);
+    dmlc::Stream *fo = dmlc::Stream::Create(param_.mean_img.c_str(), "w");
+    meanimg_.SaveBinary(*fo);
+    delete fo;
+    if (param_.silent == 0) {
+      printf("save mean image to %s..\n", param_.mean_img.c_str());
+    }
+  }
+
   // random magic
   static const int kRandMagic = 111;
   // output instance
@@ -350,6 +407,8 @@ class ImageRecordIter : public IIterator<DataInst> {
   dmlc::ThreadedIter<std::vector<InstVector> > iter_;
   // parameters
   ImageRecordParam param_;
+  // mean image
+  mshadow::TensorContainer<cpu, 3> meanimg_;
 };
 DMLC_REGISTER_PARAMETER(ImageRecParserParam);
 DMLC_REGISTER_PARAMETER(ImageRecordParam);
diff --git a/src/utils/decoder.h b/src/utils/decoder.h
deleted file mode 100644
index 52db01edee23..000000000000
--- a/src/utils/decoder.h
+++ /dev/null
@@ -1,128 +0,0 @@
-#ifndef MXNET_UTILS_DECODER_H_
-#define MXNET_UTILS_DECODER_H_
-
-#include <vector>
-#if MXNET_USE_OPENCV_DECODER == 0
-  #include <jpeglib.h>
-  #include <setjmp.h>
-  #include <jerror.h>
-#endif
-#include <dmlc/logging.h>
-#include <mshadow/tensor.h>
-#if MXNET_USE_OPENCV
-  #include <opencv2/opencv.hpp>
-#endif
-
-namespace mxnet {
-namespace utils {
-
-#if MXNET_USE_OPENCV_DECODER == 0
-struct JpegDecoder {
-public:
-  JpegDecoder(void) {
-    cinfo.err = jpeg_std_error(&jerr.base);
-    jerr.base.error_exit = jerror_exit;
-    jerr.base.output_message = joutput_message;
-    jpeg_create_decompress(&cinfo);
-  }
-  // destructor
-  ~JpegDecoder(void) {
-    jpeg_destroy_decompress(&cinfo);
-  }
-
-  inline void Decode(unsigned char *ptr, size_t sz,
-                     mshadow::TensorContainer<cpu, 3, unsigned char> *p_data) {
-    if(setjmp(jerr.jmp)) {
-      jpeg_destroy_decompress(&cinfo);
-      dmlc::Error("Libjpeg fail to decode");
-    }
-    this->jpeg_mem_src(&cinfo, ptr, sz);
-    CHECK(jpeg_read_header(&cinfo, TRUE) == JPEG_HEADER_OK) << "libjpeg: failed to decode";
-    CHECK(jpeg_start_decompress(&cinfo) == true) << "libjpeg: failed to decode";
-    p_data->Resize(mshadow::Shape3(cinfo.output_height, cinfo.output_width, cinfo.output_components));
-    JSAMPROW jptr = &((*p_data)[0][0][0]);
-    while (cinfo.output_scanline < cinfo.output_height) {
-      CHECK(jpeg_read_scanlines(&cinfo, &jptr, 1) == true) << "libjpeg: failed to decode";
-      jptr += cinfo.output_width * cinfo.output_components;
-    }
-    CHECK(jpeg_finish_decompress(&cinfo) == true) << "libjpeg: failed to decode");
-  }
-private:
-  struct jerror_mgr {
-    jpeg_error_mgr base;
-    jmp_buf jmp;
-  };
-
-  METHODDEF(void) jerror_exit(j_common_ptr jinfo) {
-    jerror_mgr* err = (jerror_mgr*)jinfo->err;
-    longjmp(err->jmp, 1);
-  }
-
-  METHODDEF(void) joutput_message(j_common_ptr) {}
-
-  static boolean mem_fill_input_buffer_ (j_decompress_ptr cinfo) {
-    dmlc::Error("JpegDecoder: bad jpeg image");
-    return true;
-  }
-
-  static void mem_skip_input_data_ (j_decompress_ptr cinfo, long num_bytes_) {
-    jpeg_source_mgr *src = cinfo->src;
-    size_t num_bytes = static_cast<size_t>(num_bytes_);
-    if (num_bytes > 0) {
-      src->next_input_byte += num_bytes;
-      CHECK(src->bytes_in_buffer >= num_bytes) << "fail to decode";
-      src->bytes_in_buffer -= num_bytes;
-    } else {
-      dmlc::Error("JpegDecoder: bad jpeg image");
-
-    }
-  }
-
-  static void mem_term_source_ (j_decompress_ptr cinfo) {}
-  static void mem_init_source_ (j_decompress_ptr cinfo) {}
-  static boolean jpeg_resync_to_restart_(j_decompress_ptr cinfo, int desired) {
-    dmlc::Error("JpegDecoder: bad jpeg image");
-    return true;
-  }
-  void jpeg_mem_src (j_decompress_ptr cinfo, void* buffer, long nbytes) {
-    src.init_source = mem_init_source_;
-    src.fill_input_buffer = mem_fill_input_buffer_;
-    src.skip_input_data = mem_skip_input_data_;
-    src.resync_to_restart = jpeg_resync_to_restart_;
-    src.term_source = mem_term_source_;
-    src.bytes_in_buffer = nbytes;
-    src.next_input_byte = static_cast<JOCTET*>(buffer);
-    cinfo->src = &src;
-  }
-
-private:
-  jpeg_decompress_struct cinfo;
-  jpeg_source_mgr src;
-  jerror_mgr jerr;
-};
-#endif
-
-#if MXNET_USE_OPENCV
-struct OpenCVDecoder {
-  void Decode(unsigned char *ptr, size_t sz, mshadow::TensorContainer<cpu, 3, unsigned char> *p_data) {
-    cv::Mat buf(1, sz, CV_8U, ptr);
-    cv::Mat res = cv::imdecode(buf, 1);
-    CHECK(res.data != NULL) << "decoding fail";
-    p_data->Resize(mshadow::Shape3(res.rows, res.cols, 3));
-    for (int y = 0; y < res.rows; ++y) {
-      for (int x = 0; x < res.cols; ++x) {
-        cv::Vec3b bgr = res.at<cv::Vec3b>(y, x);
-        // store in RGB order
-        (*p_data)[y][x][2] = bgr[0];
-        (*p_data)[y][x][1] = bgr[1];
-        (*p_data)[y][x][0] = bgr[2];
-      }
-    }
-    res.release();
-  }
-};
-#endif
-} // namespace utils
-} // namespace mxnet
-
-#endif // DECODER_H
diff --git a/tests/python/test_io.py b/tests/python/test_io.py
index 991a4813033e..8706b062e5d7 100644
--- a/tests/python/test_io.py
+++ b/tests/python/test_io.py
@@ -5,28 +5,29 @@
 import pickle as pickle
 import sys
 import get_data
+from PIL import Image
 
-# prepare data
-get_data.GetMNIST_ubyte()
 
-batch_size = 100
-train_dataiter = mx.io.MNISTIter(
-        image="data/train-images-idx3-ubyte",
-        label="data/train-labels-idx1-ubyte",
-        batch_size=batch_size, shuffle=1, flat=1, silent=0, seed=10)
-val_dataiter = mx.io.MNISTIter(
-        image="data/t10k-images-idx3-ubyte",
-        label="data/t10k-labels-idx1-ubyte",
-        batch_size=batch_size, shuffle=0, flat=1, silent=0)
+def test_MNISTIter():
+    # prepare data
+    get_data.GetMNIST_ubyte()
 
-def test_MNISTIter_loop():
+    batch_size = 100
+    train_dataiter = mx.io.MNISTIter(
+            image="data/train-images-idx3-ubyte",
+            label="data/train-labels-idx1-ubyte",
+            batch_size=batch_size, shuffle=1, flat=1, silent=0, seed=10)
+    val_dataiter = mx.io.MNISTIter(
+            image="data/t10k-images-idx3-ubyte",
+            label="data/t10k-labels-idx1-ubyte",
+            batch_size=batch_size, shuffle=0, flat=1, silent=0)
+    # test_loop
     nbatch = 60000 / batch_size
     batch_count = 0
     for data, label in train_dataiter:
         batch_count += 1
     assert(nbatch == batch_count)
-
-def test_MNISTIter_reset():
+    # test_reset
     train_dataiter.reset()
     train_dataiter.iter_next()
     label_0 = train_dataiter.getlabel().numpy.flatten()
@@ -40,17 +41,40 @@ def test_MNISTIter_reset():
     assert(sum(label_0 - label_1) == 0)
 
 def test_ImageRecIter():
-    dataiter = mx.io.ImageRecordIter(path_imgrec="data/val_cxxnet.rec",
-            image_mean="data/val_cxxnet_mean.bin",
+    dataiter = mx.io.ImageRecordIter(
+            #path_imglist="data/smallset/val_cxxnet5000.txt", 
+            path_imgrec="data/val_cxxnet.rec",
+            #mean_img="data/smallset/image_net_mean.bin",
             rand_crop=True,
-            rand_mirror=True,
-            input_shape="3,224,224",
-            batch_size=128)
-
-
-
-
+            mirror=True,
+            input_shape=(3,227,227),
+            batch_size=100,
+            nthread=1,
+            seed=10)
+    # Test label read 
+    labelcount = [0 for i in range(1000)] 
+    batchcount = 0
+    for data, label in dataiter:
+        npdata = data.numpy
+        print npdata[0,:,:,:]
+        imgdata = np.zeros([227, 227, 3], dtype=np.uint8)
+        imgdata[:,:,0] = npdata[10,2,:,:]
+        imgdata[:,:,1] = npdata[10,1,:,:]
+        imgdata[:,:,2] = npdata[10,0,:,:]
+        img = Image.fromarray(imgdata)
+        imgpath = "data/smallset/test_3.jpg"
+        img.save(imgpath, format='JPEG')
 
+        exit(0)
+        print batchcount
+        sys.stdout.flush()
+        batchcount += 1
+        nplabel = label.numpy
+        for i in range(nplabel.shape[0]):
+            labelcount[int(nplabel[i])] += 1
+    # Test image
 
 
+if __name__ == '__main__':
+    test_ImageRecIter()
 

From 36aab11489acf3a9088b73abffbc18ca098fd534 Mon Sep 17 00:00:00 2001
From: sneakerkg <xiaotj1990327@gmail.com>
Date: Tue, 8 Sep 2015 02:11:46 +0800
Subject: [PATCH 13/15] checked cifar, work when nthread=1

---
 dmlc-core                  |   2 +-
 example/cifar10/cifar10.py | 101 +++++++++++++++++++++++++++++++++++++
 tests/python/get_data.py   |   7 +++
 tests/python/test_io.py    |  36 ++++++++++---
 4 files changed, 139 insertions(+), 7 deletions(-)

diff --git a/dmlc-core b/dmlc-core
index 75f1950d386d..7d3c78428819 160000
--- a/dmlc-core
+++ b/dmlc-core
@@ -1 +1 @@
-Subproject commit 75f1950d386d033b0b64919017515d27e698962a
+Subproject commit 7d3c78428819dc84c4da8ae1f302ba6c6a235a5d
diff --git a/example/cifar10/cifar10.py b/example/cifar10/cifar10.py
index 20694b7064da..95d7810cb7e9 100644
--- a/example/cifar10/cifar10.py
+++ b/example/cifar10/cifar10.py
@@ -144,8 +144,109 @@ def RandomInit(narray):
 flatten = mx.symbol.Flatten(data=pool, name="flatten1")
 fc = mx.symbol.FullyConnected(data=flatten, num_hidden=10, name="fc1")
 loss = mx.symbol.Softmax(data=fc, name="softmax")
+args_list = loss.list_arguments()
 
 data_shape = (128, 3, 28, 28)
 arg_shapes, out_shapes, aux_shapes = loss.infer_shape(data=data_shape)
 
+arg_narrays = [mx.narray.create(shape, ctx=mx.Context("gpu")) for shape in arg_shapes]
+grad_narrays = [mx.narray.create(shape, ctx=mx.Context("gpu")) for shape in arg_shapes]
 
+inputs = dict(zip(args_list, arg_narrays))
+
+name2shape = dict(zip(args_list, arg_shapes))
+pred = mx.narray.create(out_shapes[0])
+
+np.random.seed(0)
+# set random weight
+for name, narray in inputs.items():
+    if "weight" in name:
+        tmp = mx.narray.create(name2shape[name])
+        tmp.numpy[:] = np.random.uniform(-0.07, 0.07, name2shape[name])
+        tmp.copyto(narray)
+    if "bias" in name:
+        narray[:] = 0.0
+
+# bind executer
+# TODO(bing): think of a better bind interface
+executor = loss.bind(mx.Context('gpu'), arg_narrays, grad_narrays)
+# update
+
+out_narray = executor.heads()[0]
+grad_narray = mx.narray.create(out_narray.shape)
+
+epoch = 9
+lr = 0.1
+wd = 0.0004
+
+def Update(grad, weight):
+    weight[:] -= lr * grad  / batch_size
+
+block = list(zip(grad_narrays, arg_narrays))
+
+#check data
+get_data.GetCifar10()
+train_dataiter = mx.io.ImageRecordIter(
+        path_imgrec="data/cifar/train.rec",
+        mean_img="data/cifar10/cifar_mean.bin",
+        rand_crop=True,
+        rand_mirror=True,
+        input_shape=(3,28,28),
+        batch_size=128,
+        nthread=1)
+val_dataiter = mx.io.ImageRecordIter(  # test_cifar() iterates and resets it under this name
+        path_imgrec="data/cifar/test.rec",
+        mean_img="data/cifar/cifar_mean.bin",
+        rand_crop=True,
+        rand_mirror=True,
+        input_shape=(3,28,28),
+        batch_size=100,
+        nthread=1)
+
+tmp_label = mx.narray.create(name2shape["sm_label"])
+
+def test_cifar():
+    acc_train = 0.
+    acc_val = 0.
+    for i in range(epoch):
+        # train
+        print("Epoch %d" % i)
+        train_acc = 0.0
+        val_acc = 0.0
+        train_nbatch = 0
+        val_nbatch = 0
+        for data, label in train_dataiter:
+            data = data
+            tmp_label.numpy[:] = label.numpy.reshape(tmp_label.shape)
+            data.copyto(inputs["data"])
+            tmp_label.copyto(inputs["sm_label"])
+            executor.forward()
+            out_narray.copyto(pred)
+            train_acc += CalAcc(pred.numpy, label.numpy.flatten())
+            train_nbatch += 1
+            out_narray.copyto(grad_narray)
+            executor.backward([grad_narray])
+
+            for grad, weight in block:
+                Update(grad, weight)
+
+        # evaluate
+        for data, label in val_dataiter:
+            data = data
+            label = label.numpy.flatten()
+            data.copyto(inputs["data"])
+            executor.forward()
+            out_narray.copyto(pred)
+            val_acc += CalAcc(pred.numpy, label)
+            val_nbatch += 1
+        acc_train = train_acc / train_nbatch
+        acc_val = val_acc / val_nbatch
+        print("Train Acc: ", train_acc / train_nbatch)
+        print("Valid Acc: ", val_acc / val_nbatch)
+        train_dataiter.reset()
+        val_dataiter.reset()
+    assert(acc_train > 0.98)
+    assert(acc_val > 0.97)
+
+if __name__ == "__main__":
+    test_cifar()
diff --git a/tests/python/get_data.py b/tests/python/get_data.py
index 82d25d9072fb..828809f3e757 100644
--- a/tests/python/get_data.py
+++ b/tests/python/get_data.py
@@ -27,3 +27,10 @@ def GetMNIST_ubyte():
         os.system("wget http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz -P data/")
         os.system("gunzip data/t10k-labels-idx1-ubyte.gz")
 
+# download the CIFAR-10 archive into data/ (skipped if the zip exists) and unpack it there
+def GetCifar10():
+    if not os.path.isdir("data/"):
+        os.system("mkdir data/")
+    if not os.path.exists('data/cifar10.zip'):
+        os.system("wget http://webdocs.cs.ualberta.ca/~bx3/data/cifar10.zip -P data/")
+        os.system("unzip data/cifar10.zip -d data/")  # extract under data/, not cwd; presumably yields data/cifar/
diff --git a/tests/python/test_io.py b/tests/python/test_io.py
index 8706b062e5d7..a348725c4c72 100644
--- a/tests/python/test_io.py
+++ b/tests/python/test_io.py
@@ -42,16 +42,14 @@ def test_MNISTIter():
 
 def test_ImageRecIter():
     dataiter = mx.io.ImageRecordIter(
-            #path_imglist="data/smallset/val_cxxnet5000.txt", 
             path_imgrec="data/val_cxxnet.rec",
-            #mean_img="data/smallset/image_net_mean.bin",
+            mean_img="data/smallset/image_net_mean.bin",
             rand_crop=True,
             mirror=True,
             input_shape=(3,227,227),
             batch_size=100,
             nthread=1,
             seed=10)
-    # Test label read 
     labelcount = [0 for i in range(1000)] 
     batchcount = 0
     for data, label in dataiter:
@@ -64,7 +62,6 @@ def test_ImageRecIter():
         img = Image.fromarray(imgdata)
         imgpath = "data/smallset/test_3.jpg"
         img.save(imgpath, format='JPEG')
-
         exit(0)
         print batchcount
         sys.stdout.flush()
@@ -72,9 +69,36 @@ def test_ImageRecIter():
         nplabel = label.numpy
         for i in range(nplabel.shape[0]):
             labelcount[int(nplabel[i])] += 1
-    # Test image
 
+def test_Cifar10Rec():
+    dataiter = mx.io.ImageRecordIter(
+            path_imgrec="data/cifar/test.rec",
+            mean_img="data/cifar/cifar10_mean.bin",
+            rand_crop=True,
+            rand_mirror=True,
+            input_shape=(3,28,28),
+            batch_size=100,
+            nthread=1)
+    labelcount = [0 for i in range(10)] 
+    batchcount = 0
+    for data, label in dataiter:
+        npdata = data.numpy
+        print npdata[0,:,:,:]
+        imgdata = np.zeros([28, 28, 3], dtype=np.uint8)
+        imgdata[:,:,0] = npdata[0,2,:,:]
+        imgdata[:,:,1] = npdata[0,1,:,:]
+        imgdata[:,:,2] = npdata[0,0,:,:]
+        img = Image.fromarray(imgdata)
+        imgpath = "data/cifar/test.jpg"
+        img.save(imgpath, format='JPEG')
+        exit(0)
+        print batchcount
+        sys.stdout.flush()
+        batchcount += 1
+        nplabel = label.numpy
+        for i in range(nplabel.shape[0]):
+            labelcount[int(nplabel[i])] += 1
 
 if __name__ == '__main__':
-    test_ImageRecIter()
+    test_Cifar10Rec()
 

From ff58b247bd65b6ea729faac1ca88543b822d58d9 Mon Sep 17 00:00:00 2001
From: sneakerkg <xiaotj1990327@gmail.com>
Date: Tue, 8 Sep 2015 02:33:09 +0800
Subject: [PATCH 14/15] do not run auto test_io, just visualize test

---
 tests/python/test_io.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tests/python/test_io.py b/tests/python/test_io.py
index 770caf76c070..1156782bdfef 100644
--- a/tests/python/test_io.py
+++ b/tests/python/test_io.py
@@ -5,7 +5,7 @@
 import pickle as pickle
 import sys
 import get_data
-from PIL import Image
+#from PIL import Image
 
 
 def test_MNISTIter():
@@ -40,6 +40,7 @@ def test_MNISTIter():
     label_1 = train_dataiter.getlabel().numpy.flatten()
     assert(sum(label_0 - label_1) == 0)
 
+'''
 def test_ImageRecIter():
     dataiter = mx.io.ImageRecordIter(
             path_imgrec="data/val_cxxnet.rec",
@@ -82,7 +83,6 @@ def test_Cifar10Rec():
     labelcount = [0 for i in range(10)] 
     batchcount = 0
     for data, label in dataiter:
-        '''
         npdata = data.numpy
         print npdata[0,:,:,:]
         imgdata = np.zeros([28, 28, 3], dtype=np.uint8)
@@ -93,7 +93,6 @@ def test_Cifar10Rec():
         imgpath = "data/cifar/test.jpg"
         img.save(imgpath, format='JPEG')
         exit(0)
-        '''
         print "Batch: ", batchcount
         sys.stdout.flush()
         batchcount += 1
@@ -102,5 +101,4 @@ def test_Cifar10Rec():
             labelcount[int(nplabel[i])] += 1
     for i in range(10):
         assert(labelcount[i] == 1000)
-if __name__ == '__main__':
-    test_Cifar10Rec()
+'''

From 528b5a4f037224578372d3d0374edc3c82136a63 Mon Sep 17 00:00:00 2001
From: sneakerkg <xiaotj1990327@gmail.com>
Date: Tue, 8 Sep 2015 02:43:28 +0800
Subject: [PATCH 15/15] one path bug

---
 example/cifar10/cifar10.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/example/cifar10/cifar10.py b/example/cifar10/cifar10.py
index 95d7810cb7e9..14d9bd1b8971 100644
--- a/example/cifar10/cifar10.py
+++ b/example/cifar10/cifar10.py
@@ -188,7 +188,7 @@ def Update(grad, weight):
 get_data.GetCifar10()
 train_dataiter = mx.io.ImageRecordIter(
         path_imgrec="data/cifar/train.rec",
-        mean_img="data/cifar10/cifar_mean.bin",
+        mean_img="data/cifar/cifar_mean.bin",
         rand_crop=True,
         rand_mirror=True,
         input_shape=(3,28,28),