From 4fde2474cf7cbfbfbd9730b66e4087a1e1dc0881 Mon Sep 17 00:00:00 2001 From: Sergey Kosov Date: Tue, 23 Aug 2016 02:45:37 +0200 Subject: [PATCH] Sherwood lib: parallel forest trainer: bug fixing parallel forest trainer: enabling with PPL --- DGM/DGM.vcxproj | 3 + DGM/DGM.vcxproj.filters | 9 + DGM/TrainNodeMsRF.cpp | 30 +- DGM/TrainNodeMsRF.h | 12 +- DGM/sherwood/ForestTrainer.h | 504 ++++++++---------- DGM/sherwood/ParallelForestTrainer.h | 760 +++++++++++++-------------- Demo Train/Demo Train.vcxproj.user | 2 +- Demo Train/main.cpp | 2 +- doc/Doxyfile | 3 +- include/types.h | 5 +- 10 files changed, 633 insertions(+), 697 deletions(-)
diff --git a/DGM/DGM.vcxproj b/DGM/DGM.vcxproj index 08508aaa..648d80e2 100644 --- a/DGM/DGM.vcxproj +++ b/DGM/DGM.vcxproj @@ -58,6 +58,9 @@ + + +
diff --git a/DGM/DGM.vcxproj.filters b/DGM/DGM.vcxproj.filters index 62a770be..a46e45e5 100644 --- a/DGM/DGM.vcxproj.filters +++ b/DGM/DGM.vcxproj.filters @@ -310,6 +310,15 @@ Include + + External\Sherwood Utilities + + + External\Sherwood Utilities + + + External\Sherwood Utilities +
diff --git a/DGM/TrainNodeMsRF.cpp b/DGM/TrainNodeMsRF.cpp index 3e29ec6d..60381a3e 100644 --- a/DGM/TrainNodeMsRF.cpp +++ b/DGM/TrainNodeMsRF.cpp @@ -1,7 +1,13 @@ #include "TrainNodeMsRF.h" #include "macroses.h" -//#include "sherwood\ParallelForestTrainer.h" // for parallle computing +#ifdef USE_SHERWOOD + +#include "sherwood\Sherwood.h" + +#ifdef USE_PPL +#include "sherwood\ParallelForestTrainer.h" // for parallel computing +#endif #include "sherwood\utilities\FeatureResponseFunctions.h" #include "sherwood\utilities\StatisticsAggregators.h" @@ -27,11 +33,12 @@ CTrainNodeMsRF::CTrainNodeMsRF(byte nStates, word nFeatures, int maxSamples) : C void CTrainNodeMsRF::init(TrainNodeMsRFParams params) { // Some default parameters - m_params.MaxDecisionLevels = params.max_decision_levels - 1; - m_params.NumberOfCandidateFeatures = params.num_of_candidate_features; - m_params.NumberOfCandidateThresholdsPerFeature = params.num_of_candidate_thresholds_per_feature; - m_params.NumberOfTrees = params.num_ot_trees; - m_params.Verbose = params.verbose; + m_pParams = std::auto_ptr(new sw::TrainingParameters()); + m_pParams->MaxDecisionLevels = params.max_decision_levels - 1; + m_pParams->NumberOfCandidateFeatures = params.num_of_candidate_features; + m_pParams->NumberOfCandidateThresholdsPerFeature = params.num_of_candidate_thresholds_per_feature; + m_pParams->NumberOfTrees = params.num_ot_trees; + m_pParams->Verbose = params.verbose; m_pData = std::auto_ptr(new sw::DataPointCollection()); m_pData->m_dimension = m_nFeatures; @@ -100,8 +107,12 @@ void CTrainNodeMsRF::train(void) sw::Random random; sw::ClassificationTrainingContext classificationContext(m_nStates, m_nFeatures); - m_pForest = sw::ForestTrainer::TrainForest(random, m_params, classificationContext, *m_pData); - //m_pForest = sw::ParallelForestTrainer::TrainForest(random, m_params, classificationContext, *m_pData); +#ifdef USE_PPL + // Use this function with caution - it is not verified! 
+ m_pForest = sw::ParallelForestTrainer::TrainForest(random, *m_pParams, classificationContext, *m_pData); +#else + m_pForest = sw::ForestTrainer::TrainForest(random, *m_pParams, classificationContext, *m_pData); +#endif } void CTrainNodeMsRF::calculateNodePotentials(const Mat &featureVector, Mat &potential, Mat &mask) const @@ -128,4 +139,5 @@ void CTrainNodeMsRF::calculateNodePotentials(const Mat &featureVector, Mat &pote for (byte s = 0; s < m_nStates; s++) potential.at(s, 0) = (1.0f - mudiness) * h.GetProbability(s); } -} \ No newline at end of file +} +#endif
diff --git a/DGM/TrainNodeMsRF.h b/DGM/TrainNodeMsRF.h index 71b1407a..eba6a301 100644 --- a/DGM/TrainNodeMsRF.h +++ b/DGM/TrainNodeMsRF.h @@ -3,17 +3,19 @@ #pragma once #include "TrainNode.h" -#include "sherwood\Sherwood.h" - -namespace sw = MicrosoftResearch::Cambridge::Sherwood; +#ifdef USE_SHERWOOD namespace MicrosoftResearch { namespace Cambridge { namespace Sherwood { class LinearFeatureResponse; class HistogramAggregator; class DataPointCollection; + template class Forest; + struct TrainingParameters; }}} +namespace sw = MicrosoftResearch::Cambridge::Sherwood; + namespace DirectGraphicalModels { ///@brief Microsoft Research Random Forest parameters @@ -43,6 +45,7 @@ namespace DirectGraphicalModels * @ingroup moduleTrainNode * @brief Microsoft Sherwood Random Forest training class * @details This class is based on the Sherwood C++ code library for decision forests v.1.0.0 + * > In order to use the Sherwood library, DGM must be built with the \b USE_SHERWOOD flag * @author Sergey G. Kosov, sergey.kosov@project-10.de */ class CTrainNodeMsRF : public CTrainNode @@ -100,7 +103,8 @@ namespace DirectGraphicalModels private: - sw::TrainingParameters m_params; + std::auto_ptr m_pParams; size_t m_maxSamples; }; } +#endif
diff --git a/DGM/sherwood/ForestTrainer.h b/DGM/sherwood/ForestTrainer.h index 3908b928..233c5ac3 100644 --- a/DGM/sherwood/ForestTrainer.h +++ b/DGM/sherwood/ForestTrainer.h @@ -1,8 +1,7 @@ -#pragma once - // This file defines the ForestTraininer and TreeTrainer classes, which are // responsible for creating new DecisionForest instances by learning from // training data. Please see also ParallelForestTrainer.h. +#pragma once #include @@ -13,303 +12,262 @@ #include "ProgressStream.h" #include "TrainingParameters.h" - #include "Interfaces.h" #include "Tree.h" namespace MicrosoftResearch { namespace Cambridge { namespace Sherwood { - class Random; - - /// - /// A decision tree training operation - used internally within TreeTrainer - /// to represent the operation of training a single tree. - /// - template - class TreeTrainingOperation // where F : IFeatureResponse where S : IStatisticsAggregator - { - private: - typedef typename std::vector >::size_type NodeIndex; - typedef typename std::vector::size_type DataPointIndex; - - Random& random_; - - const IDataPointCollection& data_; - - ITrainingContext& trainingContext_; - - TrainingParameters parameters_; + class Random; + + /** + * @brief A decision tree training operation - used internally within TreeTrainer to represent the operation of training a single tree. 
+ * @tparam F IFeatureResponse + * @tparam S IStatisticsAggregator + */ + template + class TreeTrainingOperation + { + private: + typedef typename std::vector >::size_type NodeIndex; + typedef typename std::vector::size_type DataPointIndex; + + Random & random_; + const IDataPointCollection & data_; + ITrainingContext & trainingContext_; + TrainingParameters parameters_; + S parentStatistics_, leftChildStatistics_, rightChildStatistics_; + std::vector partitionStatistics_; + std::vector responses_; + std::vector indices_; + ProgressStream progress_; + + + public: + // Constructor + TreeTrainingOperation(Random &random, ITrainingContext &trainingContext, const TrainingParameters ¶meters, const IDataPointCollection &data, ProgressStream &progress) + : random_(random) + , data_(data) + , trainingContext_(trainingContext) + , progress_(progress) + { + parameters_ = parameters; + + indices_ .resize(data.Count()); + for (DataPointIndex i = 0; i < indices_.size(); i++) indices_[i] = i; + + responses_.resize(data.Count()); + + parentStatistics_ = trainingContext_.GetStatisticsAggregator(); + + leftChildStatistics_ = trainingContext_.GetStatisticsAggregator(); + rightChildStatistics_ = trainingContext_.GetStatisticsAggregator(); + + partitionStatistics_.resize(parameters.NumberOfCandidateThresholdsPerFeature + 1); + for (unsigned int i = 0; i < parameters.NumberOfCandidateThresholdsPerFeature + 1; i++) + partitionStatistics_[i] = trainingContext_.GetStatisticsAggregator(); + } + + void TrainNodesRecurse(std::vector >& nodes, NodeIndex nodeIndex, DataPointIndex i0, DataPointIndex i1, int recurseDepth) + { + assert(nodeIndex < nodes.size()); + progress_[Verbose] << Tree::GetPrettyPrintPrefix((int) nodeIndex) << i1 - i0 << ": "; + + // First aggregate statistics over the samples at the parent node + parentStatistics_.Clear(); + for (DataPointIndex i = i0; i < i1; i++) parentStatistics_.Aggregate(data_, indices_[i]); + + if (nodeIndex >= nodes.size() / 2) { // this is a leaf node, nothing else to do + nodes[nodeIndex].InitializeLeaf(parentStatistics_); + progress_[Verbose] << "Terminating at max depth." 
<< std::endl; + return; + } - std::vector indices_; + double maxGain = 0.0; + F bestFeature; + float bestThreshold = 0.0f; - std::vector responses_; + // Iterate over candidate features + std::vector thresholds; + for (int f = 0; f < parameters_.NumberOfCandidateFeatures; f++) { + F feature = trainingContext_.GetRandomFeature(random_); - S parentStatistics_, leftChildStatistics_, rightChildStatistics_; - std::vector partitionStatistics_; + // reset statistics + for (unsigned int b = 0; b < parameters_.NumberOfCandidateThresholdsPerFeature + 1; b++) partitionStatistics_[b].Clear(); - ProgressStream progress_; + // Compute feature response per samples at this node + for (DataPointIndex i = i0; i < i1; i++) responses_[i] = feature.GetResponse(data_, indices_[i]); - public: - TreeTrainingOperation( - Random& random, - ITrainingContext& trainingContext, - const TrainingParameters& parameters, - const IDataPointCollection& data, - ProgressStream& progress): - random_(random), - data_(data), - trainingContext_(trainingContext), - progress_(progress) - { - parameters_ = parameters; + int nThresholds; + if ((nThresholds = ChooseCandidateThresholds(random_, &indices_[0], i0, i1, &responses_[0], thresholds)) == 0) continue; - indices_ .resize(data.Count()); - for (DataPointIndex i = 0; i < indices_.size(); i++) - indices_[i] = i; + // Aggregate statistics over sample partitions + for (DataPointIndex i = i0; i < i1; i++) { + int b = 0; + while (b < nThresholds && responses_[i] >= thresholds[b]) b++; + partitionStatistics_[b].Aggregate(data_, indices_[i]); + } - responses_.resize(data.Count()); + for (int t = 0; t < nThresholds; t++) { + leftChildStatistics_.Clear(); + rightChildStatistics_.Clear(); + for (int p = 0; p < nThresholds + 1 /*i.e. nBins*/; p++) { + if (p <= t) leftChildStatistics_.Aggregate(partitionStatistics_[p]); + else rightChildStatistics_.Aggregate(partitionStatistics_[p]); + } + + // Compute gain over sample partitions + double gain = trainingContext_.ComputeInformationGain(parentStatistics_, leftChildStatistics_, rightChildStatistics_); + + if (gain >= maxGain) { + maxGain = gain; + bestFeature = feature; + bestThreshold = thresholds[t]; + } + } // t + } // f + + if (maxGain == 0.0) { + nodes[nodeIndex].InitializeLeaf(parentStatistics_); + progress_[Verbose] << "Terminating with zero gain." << std::endl; + return; + } - parentStatistics_ = trainingContext_.GetStatisticsAggregator(); + // Now reorder the data point indices using the winning feature and thresholds. + // Also recompute child node statistics so the client can decide whether + // to terminate training of this branch. 
+ leftChildStatistics_.Clear(); + rightChildStatistics_.Clear(); - leftChildStatistics_ = trainingContext_.GetStatisticsAggregator(); - rightChildStatistics_ = trainingContext_.GetStatisticsAggregator(); + for (DataPointIndex i = i0; i < i1; i++) { + responses_[i] = bestFeature.GetResponse(data_, indices_[i]); + if (responses_[i] < bestThreshold) leftChildStatistics_.Aggregate(data_, indices_[i]); + else rightChildStatistics_.Aggregate(data_, indices_[i]); + } - partitionStatistics_.resize(parameters.NumberOfCandidateThresholdsPerFeature + 1); - for (unsigned int i = 0; i < parameters.NumberOfCandidateThresholdsPerFeature + 1; i++) - partitionStatistics_[i] = trainingContext_.GetStatisticsAggregator(); - } + if (trainingContext_.ShouldTerminate(parentStatistics_, leftChildStatistics_, rightChildStatistics_, maxGain)) { + nodes[nodeIndex].InitializeLeaf(parentStatistics_); + progress_[Verbose] << "Terminating with no split." << std::endl; + return; + } - void TrainNodesRecurse(std::vector >& nodes, NodeIndex nodeIndex, DataPointIndex i0, DataPointIndex i1, int recurseDepth) - { - assert(nodeIndex < nodes.size()); - progress_[Verbose] << Tree::GetPrettyPrintPrefix((int) nodeIndex) << i1 - i0 << ": "; + // Otherwise this is a new decision node, recurse for children. + nodes[nodeIndex].InitializeSplit(bestFeature, bestThreshold, parentStatistics_); - // First aggregate statistics over the samples at the parent node - parentStatistics_.Clear(); - for (DataPointIndex i = i0; i < i1; i++) - parentStatistics_.Aggregate(data_, indices_[i]); + // Now do partition sort - any sample with response greater goes left, otherwise right + DataPointIndex ii = Tree::Partition(responses_, indices_, i0, i1, bestThreshold); - if (nodeIndex >= nodes.size() / 2) // this is a leaf node, nothing else to do - { - nodes[nodeIndex].InitializeLeaf(parentStatistics_); - progress_[Verbose] << "Terminating at max depth." << std::endl; - return; - } + assert(ii >= i0 && i1 >= ii); - double maxGain = 0.0; - F bestFeature; - float bestThreshold = 0.0f; + progress_[Verbose] << " (threshold = " << bestThreshold << ", gain = "<< maxGain << ")." 
<< std::endl; - // Iterate over candidate features - std::vector thresholds; - for (int f = 0; f < parameters_.NumberOfCandidateFeatures; f++) { - F feature = trainingContext_.GetRandomFeature(random_); + TrainNodesRecurse(nodes, nodeIndex * 2 + 1, i0, ii, recurseDepth + 1); + TrainNodesRecurse(nodes, nodeIndex * 2 + 2, ii, i1, recurseDepth + 1); + } - for (unsigned int b = 0; b < parameters_.NumberOfCandidateThresholdsPerFeature + 1; b++) - partitionStatistics_[b].Clear(); // reset statistics - // Compute feature response per samples at this node - for (DataPointIndex i = i0; i < i1; i++) - responses_[i] = feature.GetResponse(data_, indices_[i]); + private: + int ChooseCandidateThresholds(Random &random, size_t *dataIndices, DataPointIndex i0, DataPointIndex i1, const float *responses, std::vector &thresholds) + { + thresholds.resize(parameters_.NumberOfCandidateThresholdsPerFeature + 1); + std::vector& quantiles = thresholds; // shorthand, for code clarity - we reuse memory to avoid allocation int nThresholds; - if ((nThresholds = ChooseCandidateThresholds(random_, &indices_[0], i0, i1, &responses_[0], thresholds)) == 0) - continue; - - // Aggregate statistics over sample partitions - for (DataPointIndex i = i0; i < i1; i++) { - int b = 0; - while (b < nThresholds && responses_[i] >= thresholds[b]) - b++; - - partitionStatistics_[b].Aggregate(data_, indices_[i]); + // If there are enough response values... + if (i1 - i0 > parameters_.NumberOfCandidateThresholdsPerFeature) { + // ...make a random draw of NumberOfCandidateThresholdsPerFeature+1 response values + nThresholds = parameters_.NumberOfCandidateThresholdsPerFeature; + for (int i = 0; i < nThresholds + 1; i++) + quantiles[i] = responses[(int) random.Next((int)i0, (int)i1)]; // sample randomly from all responses + } else { + // ...otherwise use all response values. + nThresholds = (int)i1 - (int)i0 - 1; + std::copy(&responses[i0], &responses[i1], quantiles.begin()); } - for (int t = 0; t < nThresholds; t++) { - leftChildStatistics_.Clear(); - rightChildStatistics_.Clear(); - for (int p = 0; p < nThresholds + 1 /*i.e. nBins*/; p++) { - if (p <= t) leftChildStatistics_.Aggregate(partitionStatistics_[p]); - else rightChildStatistics_.Aggregate(partitionStatistics_[p]); - } + // Sort the response values to form approximate quantiles. + std::sort(quantiles.begin(), quantiles.end()); + + if (quantiles[0] == quantiles[nThresholds]) return 0; // all sampled response values were the same + + // Compute n candidate thresholds by sampling in between n+1 approximate quantiles + for (int i = 0; i < nThresholds; i++) thresholds[i] = quantiles[i] + (float)(random_.NextDouble() * (quantiles[i + 1] - quantiles[i])); + + return nThresholds; + } + }; + + /** + * @brief Used to train decision trees. + * @tparam F IFeatureResponse + * @tparam S IStatisticsAggregator + */ + template class TreeTrainer + { + public: + /** + * @brief Train a new decision tree given some training data and a training problem described by an ITrainingContext instance. + * @param random The single random number generator. + * @param progress Progress reporting target. + * @param context The %ITrainingContext instance by which the training framework interacts with the training data. Implemented within client code. + * @param parameters Training parameters. + * @param data The training data. + * @returns A new decision tree. 
+ */ + static std::auto_ptr > TrainTree(Random &random, ITrainingContext &context, const TrainingParameters ¶meters, const IDataPointCollection &data, ProgressStream *progress = nullptr) + { + ProgressStream defaultProgress(std::cout, parameters.Verbose ? Verbose : Interest); + if (progress == 0) progress = &defaultProgress; + + TreeTrainingOperation trainingOperation(random, context, parameters, data, *progress); + + std::auto_ptr > tree = std::auto_ptr>(new Tree(parameters.MaxDecisionLevels)); + + (*progress)[Verbose] << std::endl; + + trainingOperation.TrainNodesRecurse(tree->GetNodes(), 0, 0, data.Count(), 0); // will recurse until termination criterion is met + + (*progress)[Verbose] << std::endl; + + tree->CheckValid(); + + return tree; + } + }; + + /** + * @brief Learns new decision forests from training data. + * @tparam F IFeatureResponse + * @tparam S IStatisticsAggregator + */ + template class ForestTrainer + { + public: + /** + * @brief Train a new decision forest given some training data and a training problem described by an instance of the ITrainingContext interface. + * @param random %Random number generator. + * @param parameters Training parameters. + * @param context An %ITrainingContext instance describing the training problem, e.g. classification, density estimation, etc. + * @param data The training data. + * @param progress The progress. + * @returns A new decision forest. + */ + static std::auto_ptr > TrainForest(Random &random, const TrainingParameters ¶meters, ITrainingContext &context, const IDataPointCollection &data, ProgressStream *progress = nullptr) + { + ProgressStream defaultProgress(std::cout, parameters.Verbose ? Verbose : Interest); + if(progress == 0) progress=&defaultProgress; + + std::auto_ptr > forest = std::auto_ptr >(new Forest()); + + for (int t = 0; t < parameters.NumberOfTrees; t++) { + (*progress)[Interest] << "\rTraining tree "<< t << "..."; + + std::auto_ptr > tree = TreeTrainer::TrainTree(random, context, parameters, data, progress); + forest->AddTree(tree); + } - // Compute gain over sample partitions - double gain = trainingContext_.ComputeInformationGain(parentStatistics_, leftChildStatistics_, rightChildStatistics_); + (*progress)[Interest] << "\rTrained " << parameters.NumberOfTrees << " trees. " << std::endl; - if (gain >= maxGain) { - maxGain = gain; - bestFeature = feature; - bestThreshold = thresholds[t]; - } - } // t - } // f - - if (maxGain == 0.0) - { - nodes[nodeIndex].InitializeLeaf(parentStatistics_); - progress_[Verbose] << "Terminating with zero gain." << std::endl; - return; - } - - // Now reorder the data point indices using the winning feature and thresholds. - // Also recompute child node statistics so the client can decide whether - // to terminate training of this branch. - leftChildStatistics_.Clear(); - rightChildStatistics_.Clear(); - - for (DataPointIndex i = i0; i < i1; i++) - { - responses_[i] = bestFeature.GetResponse(data_, indices_[i]); - if (responses_[i] < bestThreshold) - leftChildStatistics_.Aggregate(data_, indices_[i]); - else - rightChildStatistics_.Aggregate(data_, indices_[i]); - } - - if (trainingContext_.ShouldTerminate(parentStatistics_, leftChildStatistics_, rightChildStatistics_, maxGain)) - { - nodes[nodeIndex].InitializeLeaf(parentStatistics_); - progress_[Verbose] << "Terminating with no split." << std::endl; - return; - } - - // Otherwise this is a new decision node, recurse for children. 
- nodes[nodeIndex].InitializeSplit(bestFeature, bestThreshold, parentStatistics_); - - // Now do partition sort - any sample with response greater goes left, otherwise right - DataPointIndex ii = Tree::Partition(responses_, indices_, i0, i1, bestThreshold); - - assert(ii >= i0 && i1 >= ii); - - progress_[Verbose] << " (threshold = " << bestThreshold << ", gain = "<< maxGain << ")." << std::endl; - - TrainNodesRecurse(nodes, nodeIndex * 2 + 1, i0, ii, recurseDepth + 1); - TrainNodesRecurse(nodes, nodeIndex * 2 + 2, ii, i1, recurseDepth + 1); - } - - private: - int ChooseCandidateThresholds(Random &random, size_t * dataIndices, DataPointIndex i0, DataPointIndex i1, const float *responses, std::vector &thresholds) - { - thresholds.resize(parameters_.NumberOfCandidateThresholdsPerFeature + 1); - std::vector& quantiles = thresholds; // shorthand, for code clarity - we reuse memory to avoid allocation - - int nThresholds; - // If there are enough response values... - if (i1 - i0 > parameters_.NumberOfCandidateThresholdsPerFeature) - { - // ...make a random draw of NumberOfCandidateThresholdsPerFeature+1 response values - nThresholds = parameters_.NumberOfCandidateThresholdsPerFeature; - for (int i = 0; i < nThresholds + 1; i++) - quantiles[i] = responses[(int) random.Next((int)i0, (int)i1)]; // sample randomly from all responses - } - else - { - // ...otherwise use all response values. - nThresholds = (int)i1 - (int)i0 - 1; - std::copy(&responses[i0], &responses[i1], quantiles.begin()); - } - - // Sort the response values to form approximate quantiles. - std::sort(quantiles.begin(), quantiles.end()); - - if (quantiles[0] == quantiles[nThresholds]) - return 0; // all sampled response values were the same - - // Compute n candidate thresholds by sampling in between n+1 approximate quantiles - for (int i = 0; i < nThresholds; i++) - thresholds[i] = quantiles[i] + (float)(random_.NextDouble() * (quantiles[i + 1] - quantiles[i])); - - return nThresholds; - } - }; - - /// - /// Used to train decision trees. - /// - template - class TreeTrainer - { - public: - /// - /// Train a new decision tree given some training data and a training - /// problem described by an ITrainingContext instance. - /// - /// The single random number generator. - /// Progress reporting target. - /// The ITrainingContext instance by which - /// the training framework interacts with the training data. - /// Implemented within client code. - /// Training parameters. - /// The training data. - /// A new decision tree. - static std::auto_ptr > TrainTree( - Random& random, - ITrainingContext& context, - const TrainingParameters& parameters, - const IDataPointCollection& data, - ProgressStream* progress=0) - { - ProgressStream defaultProgress(std::cout, parameters.Verbose? Verbose:Interest); - if(progress==0) - progress=&defaultProgress; - - TreeTrainingOperation trainingOperation(random, context, parameters, data, *progress); - - std::auto_ptr > tree = std::auto_ptr >(new Tree(parameters.MaxDecisionLevels)); - - (*progress)[Verbose] << std::endl; - - trainingOperation.TrainNodesRecurse(tree->GetNodes(), 0, 0, data.Count(), 0); // will recurse until termination criterion is met - - (*progress)[Verbose] << std::endl; - - tree->CheckValid(); - - return tree; - } - }; - - /// - /// Learns new decision forests from training data. 
- /// - template - class ForestTrainer // where F:IFeatureResponse where S:IStatisticsAggregator - { - public: - /// - /// Train a new decision forest given some training data and a training - /// problem described by an instance of the ITrainingContext interface. - /// - /// Random number generator. - /// Training parameters. - /// An ITrainingContext instance describing - /// the training problem, e.g. classification, density estimation, etc. - /// The training data. - /// The progress. - /// A new decision forest. - static std::auto_ptr > TrainForest( - Random& random, - const TrainingParameters& parameters, - ITrainingContext& context, - const IDataPointCollection& data, - ProgressStream* progress=0) - { - ProgressStream defaultProgress(std::cout, parameters.Verbose? Verbose:Interest); - if(progress==0) - progress=&defaultProgress; - - std::auto_ptr > forest = std::auto_ptr >(new Forest()); - - for (int t = 0; t < parameters.NumberOfTrees; t++) - { - (*progress)[Interest] << "\rTraining tree "<< t << "..."; - - std::auto_ptr > tree = TreeTrainer::TrainTree(random, context, parameters, data, progress); - forest->AddTree(tree); - } - (*progress)[Interest] << "\rTrained " << parameters.NumberOfTrees << " trees. " << std::endl; - - return forest; - } - }; + return forest; + } + }; } } } diff --git a/DGM/sherwood/ParallelForestTrainer.h b/DGM/sherwood/ParallelForestTrainer.h index cfbfb35e..456be9a2 100644 --- a/DGM/sherwood/ParallelForestTrainer.h +++ b/DGM/sherwood/ParallelForestTrainer.h @@ -1,406 +1,354 @@ -#pragma once - -// This file defines the ParallelForestTrainer and ParallelTreeTraininer classes, -// which are responsible for creating new Tree instances by learning from -// training data. These classes have almost identical interfaces to ForestTrainer -// and TreeTrainer, but allow candidate feature evaluation to be shared over a -// specified maximum number of threads. - -// *** NOTE *** Compiling this header requires OpenMP. - -#include - -#include -#include -#include - -#include - -#include "ProgressStream.h" - -#include "TrainingParameters.h" -#include "Interfaces.h" -#include "Tree.h" - -namespace MicrosoftResearch { namespace Cambridge { namespace Sherwood -{ - class Random; - - /// - /// A decision tree training operation in which candidate feature response - /// function evaluation is distributed over multiple threads - used - /// internally within ParallelTreeTrainer to encapsulate the training a single tree. 
- /// - template - class ParallelTreeTrainingOperation // where F : IFeatureResponse where S : IStatisticsAggregator - { - private: - typedef typename std::vector >::size_type NodeIndex; - typedef typename std::vector::size_type DataPointIndex; - - Random& random_; - - const IDataPointCollection& data_; - - ITrainingContext& trainingContext_; - - TrainingParameters parameters_; - - int maxThreads_; - - S parentStatistics_, leftChildStatistics_, rightChildStatistics_; - - std::vector partitionStatistics_; - std::vector responses_; - std::vector indices_; - - ProgressStream progress_; - - class ThreadLocalData - { - public: - double maxGain; - F bestFeature; - float bestThreshold; - - S parentStatistics_, leftChildStatistics_, rightChildStatistics_; - - std::vector partitionStatistics_; - std::vector responses_; - std::vector thresholds; - - Random random_; - - ThreadLocalData() - { - - } - - ThreadLocalData(Random& random, ITrainingContext& trainingContext_, const TrainingParameters& parameters, IDataPointCollection const & data):random_(random.Next()) - { - maxGain = 0.0; - bestThreshold = 0.0; - parentStatistics_ = trainingContext_.GetStatisticsAggregator(); - - leftChildStatistics_ = trainingContext_.GetStatisticsAggregator(); - rightChildStatistics_ = trainingContext_.GetStatisticsAggregator(); - - partitionStatistics_.resize(parameters.NumberOfCandidateThresholdsPerFeature + 1); - for (unsigned int i = 0; i < parameters.NumberOfCandidateThresholdsPerFeature + 1; i++) - partitionStatistics_[i] = trainingContext_.GetStatisticsAggregator(); - - responses_.resize(data.Count()); - // thresholds_ will be resized() in ChooseCandidateThresholds() - } - - void Clear() - { - maxGain = 0.0; - bestFeature = F(); - bestThreshold = 0.0f; - } - }; - - std::vector threadLocalData_; - - public: - ParallelTreeTrainingOperation( - Random& random, - ITrainingContext& trainingContext, - const TrainingParameters& parameters, - int maxThreads, - const IDataPointCollection& data, - ProgressStream& progress): - random_(random), - data_(data), - maxThreads_(maxThreads), - trainingContext_(trainingContext), - progress_(progress) - { - parameters_ = parameters; - - indices_ .resize(data.Count()); - for (DataPointIndex i = 0; i < indices_.size(); i++) - indices_[i] = i; - - threadLocalData_.resize(maxThreads_); - for (int threadIndex = 0; threadIndex < maxThreads_; threadIndex++) - // Note use of placement new operator to initialize already-allocated memory - new (&threadLocalData_[threadIndex]) ThreadLocalData(random, trainingContext_, parameters_, data_); - } - - void TrainNodesRecurse(std::vector >& nodes, NodeIndex nodeIndex, DataPointIndex i0, DataPointIndex i1, int recurseDepth) - { - assert(nodeIndex < nodes.size()); - progress_[Verbose] << Tree::GetPrettyPrintPrefix(nodeIndex) << i1 - i0 << ": "; - - // First aggregate statistics over the samples at the parent node - parentStatistics_.Clear(); - for (DataPointIndex i = i0; i < i1; i++) - parentStatistics_.Aggregate(data_, indices_[i]); - - // Copy parent statistics to thread local storage in case client IStatisticsAggregator implementations are not reentrant - for (int t = 0; t < maxThreads_; t++) - threadLocalData_[t].parentStatistics_ = parentStatistics_.DeepClone(); - - if (nodeIndex >= nodes.size() / 2) // this is a leaf node, nothing else to do - { - nodes[nodeIndex].InitializeLeaf(parentStatistics_); - progress_[Verbose] << "Terminating at max depth." 
<< std::endl; - return; - } - - #pragma omp parallel for - for(int threadIndex=0; threadIndex < maxThreads_; threadIndex++) - { - ThreadLocalData& tl = threadLocalData_[threadIndex]; // shorthand - - tl.Clear(); - - // Iterate over candidate features - std::vector thresholds; - for (int f = 0; f < parameters_.NumberOfCandidateFeatures/maxThreads_; f++) - { - F feature = trainingContext_.GetRandomFeature(tl.random_); - - for (unsigned int b = 0; b < parameters_.NumberOfCandidateThresholdsPerFeature + 1; b++) - tl.partitionStatistics_[b].Clear(); // reset statistics - - // Compute feature response per samples at this node - for (DataPointIndex i = i0; i < i1; i++) - tl.responses_[i] = feature.GetResponse(data_, indices_[i]); - - int nThresholds; - if ((nThresholds = ChooseCandidateThresholds(tl.random_, &indices_[0], i0, i1, &tl.responses_[0], tl.thresholds)) == 0) - continue; - - // Aggregate statistics over sample partitions - for (DataPointIndex i = i0; i < i1; i++) - { - int b = 0; - while (b < nThresholds && tl.responses_[i] >= tl.thresholds[b]) - b++; - - tl.partitionStatistics_[b].Aggregate(data_, indices_[i]); - } - - for (int t = 0; t < nThresholds; t++) - { - tl.leftChildStatistics_.Clear(); - tl.rightChildStatistics_.Clear(); - for (int p = 0; p < nThresholds + 1 /*i.e. nBins*/; p++) - { - if (p <= t) - tl.leftChildStatistics_.Aggregate(tl.partitionStatistics_[p]); - else - tl.rightChildStatistics_.Aggregate(tl.partitionStatistics_[p]); - } - - // Compute gain over sample partitions - double gain = trainingContext_.ComputeInformationGain(tl.parentStatistics_, tl.leftChildStatistics_,tl. rightChildStatistics_); - - if (gain >= tl.maxGain) - { - tl.maxGain = gain; - tl.bestFeature = feature; - tl.bestThreshold = tl.thresholds[t]; - } - } - } - } - - // Now merge over threads. - double maxGain = 0.0; - F bestFeature; - float bestThreshold=0.0; - - for (int threadIndex = 0; threadIndex < maxThreads_; threadIndex++) - { - ThreadLocalData& tl = threadLocalData_[threadIndex]; - if (tl.maxGain > maxGain) - { - maxGain = tl.maxGain; - bestFeature = tl.bestFeature; - bestThreshold = tl.bestThreshold; - } - } - - if (maxGain == 0.0) - { - nodes[nodeIndex].InitializeLeaf(parentStatistics_); - progress_[Verbose] << "Terminating with zero gain." << std::endl; - return; - } - - // Now reorder the data point indices using the winning feature and thresholds. - // Also recompute child node statistics so the client can decide whether - // to terminate training of this branch. - leftChildStatistics_.Clear(); - rightChildStatistics_.Clear(); - - for (DataPointIndex i = i0; i < i1; i++) - { - responses_[i] = bestFeature.GetResponse(data_, indices_[i]); - if (responses_[i] < bestThreshold) - leftChildStatistics_.Aggregate(data_, indices_[i]); - else - rightChildStatistics_.Aggregate(data_, indices_[i]); - } - - if (trainingContext_.ShouldTerminate(parentStatistics_, leftChildStatistics_, rightChildStatistics_, maxGain)) - { - nodes[nodeIndex].InitializeLeaf(parentStatistics_); - progress_[Verbose] << "Terminating with no split." << std::endl; - return; - } - - // Otherwise this is a new decision node, recurse for children. - nodes[nodeIndex].InitializeSplit(bestFeature, bestThreshold, parentStatistics_); - - // Now do partition sort - any sample with response greater goes left, otherwise right - DataPointIndex ii = Tree::Partition(responses_, indices_, i0, i1, bestThreshold); - - assert(ii >= i0 && i1 >= ii); - - progress_[Verbose] << " (threshold = " << bestThreshold << ", gain = "<< maxGain << ")." 
<< std::endl; - - TrainNodesRecurse(nodes, nodeIndex * 2 + 1, i0, ii, recurseDepth + 1); - TrainNodesRecurse(nodes, nodeIndex * 2 + 2, ii, i1, recurseDepth + 1); - } - - private: - int ChooseCandidateThresholds ( - Random& random, - unsigned int* dataIndices, - DataPointIndex i0, - DataPointIndex i1, - const float* responses, - std::vector& thresholds ) - { - thresholds.resize(parameters_.NumberOfCandidateThresholdsPerFeature + 1); - std::vector& quantiles = thresholds; // shorthand, for code clarity - we reuse memory to avoid allocation - - int nThresholds; - // If there are enough response values... - if (i1 - i0 > parameters_.NumberOfCandidateThresholdsPerFeature) - { - // ...make a random draw of NumberOfCandidateThresholdsPerFeature+1 response values - nThresholds = parameters_.NumberOfCandidateThresholdsPerFeature; - for (int i = 0; i < nThresholds + 1; i++) - quantiles[i] = responses[random.Next(i0, i1)]; // sample randomly from all responses - } - else - { - // ...otherwise use all response values. - nThresholds = i1 - i0 - 1; - std::copy(&responses[i0], &responses[i1], quantiles.begin()); - } - - // Sort the response values to form approximate quantiles. - std::sort(quantiles.begin(), quantiles.end()); - - if (quantiles[0] == quantiles[nThresholds]) - return 0; // all sampled response values were the same - - // Compute n candidate thresholds by sampling in between n+1 approximate quantiles - for (int i = 0; i < nThresholds; i++) - thresholds[i] = quantiles[i] + (float)(random_.NextDouble() * (quantiles[i + 1] - quantiles[i])); - - return nThresholds; - } - }; - - - /// - /// Used for multi-threaded decision tree training. Candidate feature - /// response function evaluation is distributed over multiple threads. - /// - template - class ParallelTreeTrainer - { - public: - /// - /// Train a new decision tree given some training data and a training - /// problem described by an ITrainingContext instance. - /// - /// The single random number generator. - /// Progress reporting target. - /// The ITrainingContext instance by which - /// the training framework interacts with the training data. - /// Implemented within client code. - /// Training parameters. - /// The maximum number of threads to use. - /// The training data. - /// A new decision tree. - static std::auto_ptr > TrainTree( - Random& random, - ITrainingContext& context, - const TrainingParameters& parameters, - int maxThreads, - const IDataPointCollection& data, - ProgressStream* progress=0) - { - ProgressStream defaultProgress(std::cout, parameters.Verbose? Verbose:Interest); - if(progress==0) - progress=&defaultProgress; - - ParallelTreeTrainingOperation trainingOperation(random, context, parameters, maxThreads, data, *progress); - - std::auto_ptr > tree = std::auto_ptr >(new Tree(parameters.MaxDecisionLevels)); - - (*progress)[Verbose] << std::endl; - - trainingOperation.TrainNodesRecurse(tree->GetNodes(), 0, 0, data.Count(), 0); // will recurse until termination criterion is met - - (*progress)[Verbose] << std::endl; - - tree->CheckValid(); - - return tree; - } - }; - - /// - /// Learns new decision forests from training data. - /// - template - class ParallelForestTrainer // where F:IFeatureResponse where S:IStatisticsAggregator - { - public: - /// - /// Train a new decision forest given some training data and a training - /// problem described by an instance of the ITrainingContext interface. - /// - /// Random number generator. - /// Training parameters. 
- /// An ITrainingContext instance describing - /// the training problem, e.g. classification, density estimation, etc. - /// The training data. - /// The progress. - /// A new decision forest. - static std::auto_ptr > TrainForest( - Random& random, - const TrainingParameters& parameters, - ITrainingContext& context, - const IDataPointCollection& data, - ProgressStream* progress=0) - { - ProgressStream defaultProgress(std::cout, parameters.Verbose? Verbose:Interest); - if(progress==0) - progress=&defaultProgress; - - std::auto_ptr > forest = std::auto_ptr >(new Forest()); - - for (int t = 0; t < parameters.NumberOfTrees; t++) - { - (*progress)[Interest] << "\rTraining tree "<< t << "..."; - - std::auto_ptr > tree = ParallelTreeTrainer::TrainTree(random, context, parameters, data, progress); - forest->AddTree(tree); - } - (*progress)[Interest] << "\rTrained " << parameters.NumberOfTrees << " trees. " << std::endl; - - return forest; - } - }; -} } } +// This file defines the ParallelForestTrainer and ParallelTreeTrainer classes, +// which are responsible for creating new Tree instances by learning from +// training data. These classes have almost identical interfaces to ForestTrainer +// and TreeTrainer, but allow candidate feature evaluation to be shared over a +// specified maximum number of threads. +// Bug fixing and switching to PPL by Sergey Kosov in 2016 for Project X +#pragma once + +#include +#include + +#include +#include +#include + +#include "ProgressStream.h" + +#include "TrainingParameters.h" +#include "Interfaces.h" +#include "Tree.h" + +namespace MicrosoftResearch { namespace Cambridge { namespace Sherwood +{ + class Random; + + /** + * @brief A decision tree training operation in which candidate feature response function evaluation is distributed over multiple threads - + * used internally within ParallelTreeTrainer to encapsulate the training of a single tree. 
+ * @tparam F IFeatureResponse + * @tparam S IStatisticsAggregator + */ + template class ParallelTreeTrainingOperation + { + private: + typedef typename std::vector >::size_type NodeIndex; + typedef typename std::vector::size_type DataPointIndex; + + Random & random_; + const IDataPointCollection & data_; + ITrainingContext & trainingContext_; + TrainingParameters parameters_; + S parentStatistics_, leftChildStatistics_, rightChildStatistics_; + std::vector partitionStatistics_; + std::vector responses_; + std::vector indices_; + ProgressStream progress_; + int maxThreads_; + + class ThreadLocalData + { + public: + double maxGain; + F bestFeature; + float bestThreshold; + + S parentStatistics_, leftChildStatistics_, rightChildStatistics_; + std::vector partitionStatistics_; + std::vector responses_; + std::vector thresholds; + Random random_; + + ThreadLocalData(void) { } + + ThreadLocalData(Random& random, ITrainingContext &trainingContext_, const TrainingParameters ¶meters, IDataPointCollection const &data) : random_(random.Next()) + { + maxGain = 0.0; + bestThreshold = 0.0; + parentStatistics_ = trainingContext_.GetStatisticsAggregator(); + + leftChildStatistics_ = trainingContext_.GetStatisticsAggregator(); + rightChildStatistics_ = trainingContext_.GetStatisticsAggregator(); + + partitionStatistics_.resize(parameters.NumberOfCandidateThresholdsPerFeature + 1); + for (unsigned int i = 0; i < parameters.NumberOfCandidateThresholdsPerFeature + 1; i++) + partitionStatistics_[i] = trainingContext_.GetStatisticsAggregator(); + + responses_.resize(data.Count()); + // thresholds_ will be resized() in ChooseCandidateThresholds() + } + + void Clear(void) + { + maxGain = 0.0; + bestFeature = F(); + bestThreshold = 0.0f; + } + }; + + std::vector threadLocalData_; + + public: + // Constructor + ParallelTreeTrainingOperation(Random &random, ITrainingContext &trainingContext, const TrainingParameters ¶meters, int maxThreads, const IDataPointCollection &data, ProgressStream &progress) + : random_(random) + , data_(data) + , maxThreads_(maxThreads) + , trainingContext_(trainingContext) + , progress_(progress) + { + parameters_ = parameters; + + indices_.resize(data.Count()); + for (DataPointIndex i = 0; i < indices_.size(); i++) indices_[i] = i; + + responses_.resize(data.Count()); + + parentStatistics_ = trainingContext_.GetStatisticsAggregator(); + + //leftChildStatistics_ = trainingContext_.GetStatisticsAggregator(); + //rightChildStatistics_ = trainingContext_.GetStatisticsAggregator(); + + //partitionStatistics_.resize(parameters.NumberOfCandidateThresholdsPerFeature + 1); + //for (unsigned int i = 0; i < parameters.NumberOfCandidateThresholdsPerFeature + 1; i++) + // partitionStatistics_[i] = trainingContext_.GetStatisticsAggregator(); + + threadLocalData_.resize(maxThreads_); + for (int threadIndex = 0; threadIndex < maxThreads_; threadIndex++) + // Note use of placement new operator to initialize already-allocated memory + new (&threadLocalData_[threadIndex]) ThreadLocalData(random, trainingContext_, parameters_, data_); + } + + void TrainNodesRecurse(std::vector >& nodes, NodeIndex nodeIndex, DataPointIndex i0, DataPointIndex i1, int recurseDepth) + { + assert(nodeIndex < nodes.size()); + progress_[Verbose] << Tree::GetPrettyPrintPrefix((int)nodeIndex) << i1 - i0 << ": "; + + // First aggregate statistics over the samples at the parent node + parentStatistics_.Clear(); + for (DataPointIndex i = i0; i < i1; i++) parentStatistics_.Aggregate(data_, indices_[i]); + + // Copy parent statistics to 
thread local storage in case client IStatisticsAggregator implementations are not reentrant + for (int t = 0; t < maxThreads_; t++) threadLocalData_[t].parentStatistics_ = parentStatistics_.DeepClone(); + + if (nodeIndex >= nodes.size() / 2) { // this is a leaf node, nothing else to do + nodes[nodeIndex].InitializeLeaf(parentStatistics_); + progress_[Verbose] << "Terminating at max depth." << std::endl; + return; + } + + + //for (int threadIndex = 0; threadIndex < maxThreads_; threadIndex++) { + concurrency::parallel_for(0, maxThreads_, [&](int threadIndex) { + ThreadLocalData &tl = threadLocalData_[threadIndex]; // shorthand + + tl.Clear(); + + // Iterate over candidate features + std::vector thresholds; + int fMax = std::ceil((double) parameters_.NumberOfCandidateFeatures / maxThreads_); + for (int f = 0; f < fMax; f++) { + F feature = trainingContext_.GetRandomFeature(tl.random_); + + // reset statistics + for (unsigned int b = 0; b < parameters_.NumberOfCandidateThresholdsPerFeature + 1; b++) tl.partitionStatistics_[b].Clear(); + + // Compute feature response per samples at this node + for (DataPointIndex i = i0; i < i1; i++) tl.responses_[i] = feature.GetResponse(data_, indices_[i]); + + int nThresholds; + if ((nThresholds = ChooseCandidateThresholds(tl.random_, &indices_[0], i0, i1, &tl.responses_[0], tl.thresholds)) == 0) continue; + + // Aggregate statistics over sample partitions + for (DataPointIndex i = i0; i < i1; i++) { + int b = 0; + while (b < nThresholds && tl.responses_[i] >= tl.thresholds[b]) b++; + tl.partitionStatistics_[b].Aggregate(data_, indices_[i]); + } + + for (int t = 0; t < nThresholds; t++) { + tl.leftChildStatistics_.Clear(); + tl.rightChildStatistics_.Clear(); + for (int p = 0; p < nThresholds + 1 /*i.e. nBins*/; p++) { + if (p <= t) tl.leftChildStatistics_.Aggregate(tl.partitionStatistics_[p]); + else tl.rightChildStatistics_.Aggregate(tl.partitionStatistics_[p]); + } + + // Compute gain over sample partitions + double gain = trainingContext_.ComputeInformationGain(tl.parentStatistics_, tl.leftChildStatistics_, tl.rightChildStatistics_); + + if (gain >= tl.maxGain) { + tl.maxGain = gain; + tl.bestFeature = feature; + tl.bestThreshold = tl.thresholds[t]; + } + } // t + } // f + }); // threadIndex + + + // Now merge over threads. + double maxGain = 0.0; + F bestFeature; + float bestThreshold = 0.0; + + for (int threadIndex = 0; threadIndex < maxThreads_; threadIndex++) { + ThreadLocalData &tl = threadLocalData_[threadIndex]; + if (tl.maxGain > maxGain) { + maxGain = tl.maxGain; + bestFeature = tl.bestFeature; + bestThreshold = tl.bestThreshold; + } + } + + + if (maxGain == 0.0) { + nodes[nodeIndex].InitializeLeaf(parentStatistics_); + progress_[Verbose] << "Terminating with zero gain." << std::endl; + return; + } + + // Now reorder the data point indices using the winning feature and thresholds. + // Also recompute child node statistics so the client can decide whether + // to terminate training of this branch. + leftChildStatistics_.Clear(); + rightChildStatistics_.Clear(); + + for (DataPointIndex i = i0; i < i1; i++) { + responses_[i] = bestFeature.GetResponse(data_, indices_[i]); + if (responses_[i] < bestThreshold) leftChildStatistics_.Aggregate(data_, indices_[i]); + else rightChildStatistics_.Aggregate(data_, indices_[i]); + } + + if (trainingContext_.ShouldTerminate(parentStatistics_, leftChildStatistics_, rightChildStatistics_, maxGain)) { + nodes[nodeIndex].InitializeLeaf(parentStatistics_); + progress_[Verbose] << "Terminating with no split." 
<< std::endl; + return; + } + + // Otherwise this is a new decision node, recurse for children. + nodes[nodeIndex].InitializeSplit(bestFeature, bestThreshold, parentStatistics_); + + // Now do partition sort - any sample with response greater goes left, otherwise right + DataPointIndex ii = Tree::Partition(responses_, indices_, i0, i1, bestThreshold); + + assert(ii >= i0 && i1 >= ii); + + progress_[Verbose] << " (threshold = " << bestThreshold << ", gain = " << maxGain << ")." << std::endl; + + TrainNodesRecurse(nodes, nodeIndex * 2 + 1, i0, ii, recurseDepth + 1); + TrainNodesRecurse(nodes, nodeIndex * 2 + 2, ii, i1, recurseDepth + 1); + } + + + private: + int ChooseCandidateThresholds(Random &random, size_t *dataIndices, DataPointIndex i0, DataPointIndex i1, const float *responses, std::vector &thresholds) + { + thresholds.resize(parameters_.NumberOfCandidateThresholdsPerFeature + 1); + std::vector& quantiles = thresholds; // shorthand, for code clarity - we reuse memory to avoid allocation + + int nThresholds; + // If there are enough response values... + if (i1 - i0 > parameters_.NumberOfCandidateThresholdsPerFeature) { + // ...make a random draw of NumberOfCandidateThresholdsPerFeature+1 response values + nThresholds = parameters_.NumberOfCandidateThresholdsPerFeature; + for (int i = 0; i < nThresholds + 1; i++) + quantiles[i] = responses[(int)random.Next((int)i0, (int)i1)]; // sample randomly from all responses + } + else { + // ...otherwise use all response values. + nThresholds = (int)i1 - (int)i0 - 1; + std::copy(&responses[i0], &responses[i1], quantiles.begin()); + } + + // Sort the response values to form approximate quantiles. + std::sort(quantiles.begin(), quantiles.end()); + + if (quantiles[0] == quantiles[nThresholds]) return 0; // all sampled response values were the same + + // Compute n candidate thresholds by sampling in between n+1 approximate quantiles + for (int i = 0; i < nThresholds; i++) thresholds[i] = quantiles[i] + (float)(random_.NextDouble() * (quantiles[i + 1] - quantiles[i])); + + return nThresholds; + } + }; + + + /** + * @brief Used for multi-threaded decision tree training. Candidate feature response function evaluation is distributed over multiple threads. + * @tparam F IFeatureResponse + * @tparam S IStatisticsAggregator + */ + template class ParallelTreeTrainer + { + public: + /** + * @brief Train a new decision tree given some training data and a training problem described by an ITrainingContext instance. + * @param random The single random number generator. + * @param progress Progress reporting target. + * @param context The %ITrainingContext instance by which the training framework interacts with the training data. Implemented within client code. + * @param parameters Training parameters. + * @param maxThreads The maximum number of threads to use. + * @param data The training data. + * @returns A new decision tree. + */ + static std::auto_ptr > TrainTree(Random &random, ITrainingContext &context, const TrainingParameters ¶meters, int maxThreads, const IDataPointCollection &data, ProgressStream *progress = nullptr) + { + ProgressStream defaultProgress(std::cout, parameters.Verbose ? 
Verbose : Interest); + if (progress == 0) progress = &defaultProgress; + + ParallelTreeTrainingOperation trainingOperation(random, context, parameters, maxThreads, data, *progress); + + std::auto_ptr > tree = std::auto_ptr>(new Tree(parameters.MaxDecisionLevels)); + + (*progress)[Verbose] << std::endl; + + trainingOperation.TrainNodesRecurse(tree->GetNodes(), 0, 0, data.Count(), 0); // will recurse until termination criterion is met + + (*progress)[Verbose] << std::endl; + + tree->CheckValid(); + + return tree; + } + }; + + /** + * @brief Learns new decision forests from training data. + * @tparam F IFeatureResponse + * @tparam S IStatisticsAggregator + */ + template class ParallelForestTrainer + { + public: + /** + * @brief Train a new decision forest given some training data and a training problem described by an instance of the ITrainingContext interface. + * @param random %Random number generator. + * @param parameters Training parameters. + * @param context An %ITrainingContext instance describing the training problem, e.g. classification, density estimation, etc. + * @param data The training data. + * @param progress The progress. + * @returns A new decision forest. + */ + static std::auto_ptr> TrainForest(Random &random, const TrainingParameters ¶meters, ITrainingContext &context, const IDataPointCollection &data, ProgressStream *progress = nullptr) + { + ProgressStream defaultProgress(std::cout, parameters.Verbose ? Verbose : Interest); + if (progress == 0) progress = &defaultProgress; + + std::auto_ptr > forest = std::auto_ptr>(new Forest()); + + + int nCores = MAX(1, std::thread::hardware_concurrency()); + for (int t = 0; t < parameters.NumberOfTrees; t++) { + (*progress)[Interest] << "\rTraining tree " << t << "..."; + + std::auto_ptr > tree = ParallelTreeTrainer::TrainTree(random, context, parameters, nCores, data, progress); + forest->AddTree(tree); + } + + (*progress)[Interest] << "\rTrained " << parameters.NumberOfTrees << " trees. " << std::endl; + + return forest; + } + }; +} } } diff --git a/Demo Train/Demo Train.vcxproj.user b/Demo Train/Demo Train.vcxproj.user index 7b2d9e8e..0599eef5 100644 --- a/Demo Train/Demo Train.vcxproj.user +++ b/Demo Train/Demo Train.vcxproj.user @@ -13,7 +13,7 @@ WindowsLocalDebugger - 0 4 ..\Data\001_img.jpg ..\Data\001_fv.jpg ..\Data\001_gt.bmp ..\Data\001_output.jpg + 6 1 ..\Data\001_img.jpg ..\Data\001_fv.jpg ..\Data\001_gt.bmp ..\Data\001_output.jpg WindowsLocalDebugger \ No newline at end of file diff --git a/Demo Train/main.cpp b/Demo Train/main.cpp index 03eee18f..27c49d7d 100644 --- a/Demo Train/main.cpp +++ b/Demo Train/main.cpp @@ -47,7 +47,7 @@ int main(int argv, char *argc[]) CTrainNode * nodeTrainer = NULL; CTrainEdge * edgeTrainer = NULL; CGraphExt * graph = new CGraphExt(nStates); - CInfer * decoder = new CInferTRW(graph); + CInfer * decoder = new CInferLBP(graph); CMarker * marker = new CMarker(DEF_PALETTE_6); CCMat * confMat = new CCMat(nStates); float params[] = {100, 0.01f}; diff --git a/doc/Doxyfile b/doc/Doxyfile index b8c13381..85118cc8 100644 --- a/doc/Doxyfile +++ b/doc/Doxyfile @@ -2046,7 +2046,8 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = DllExport +PREDEFINED = DllExport \ + USE_SHERWOOD # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. 
The diff --git a/include/types.h b/include/types.h index 1472029f..c094fb49 100644 --- a/include/types.h +++ b/include/types.h @@ -1,9 +1,10 @@ #pragma once -#define PRINT_DEBUG_INFO -#define DEBUG_MODE +//#define PRINT_DEBUG_INFO +//#define DEBUG_MODE #define USE_PPL //#define USE_AMP +#define USE_SHERWOOD #include #include