From d81fb6a9e6f3650d46e53a061cf9e5ede0c119a5 Mon Sep 17 00:00:00 2001
From: chenshuaihua
Date: Sun, 11 Jan 2015 21:59:38 +0800
Subject: [PATCH 1/7] test

---
 tracker/rabit_yarn.py | 89 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 tracker/rabit_yarn.py

diff --git a/tracker/rabit_yarn.py b/tracker/rabit_yarn.py
new file mode 100644
index 00000000..2f896b2a
--- /dev/null
+++ b/tracker/rabit_yarn.py
@@ -0,0 +1,89 @@
+#!/usr/bin/python
+"""
+This is a script to submit rabit jobs using hadoop streaming,
+submitting the rabit processes as mappers of MapReduce.
+"""
+import argparse
+import sys
+import os
+import time
+import subprocess
+import rabit_tracker as tracker
+
+#!!! Set the path to hadoop and the hadoop streaming jar here
+hadoop_binary = 'hadoop'
+hadoop_streaming_jar = None
+
+# try to auto-detect the paths from HADOOP_HOME
+hadoop_home = os.getenv('HADOOP_HOME')
+if hadoop_home != None:
+    if hadoop_binary == None:
+        hadoop_binary = hadoop_home + '/bin/hadoop'
+        assert os.path.exists(hadoop_binary), "HADOOP_HOME does not contain the hadoop binary"
+    if hadoop_streaming_jar == None:
+	hadoop_streaming_jar = hadoop_home + '/lib/hadoop-streaming.jar'
+        assert os.path.exists(hadoop_streaming_jar), "HADOOP_HOME does not contain the hadoop streaming jar"
+
+if hadoop_binary == None or hadoop_streaming_jar == None:
+    print 'Warning: Cannot auto-detect the paths to hadoop and the streaming jar, need to set them via the arguments -jar and -hb'
+    print '\tTo enable auto-detection, you can set the environment variable HADOOP_HOME or modify rabit_yarn.py line 14'
+
+parser = argparse.ArgumentParser(description='Rabit script to submit rabit jobs using Hadoop Streaming')
+parser.add_argument('-nw', '--nworker', required=True, type=int,
+                    help = 'number of worker processes to be launched')
+parser.add_argument('-nt', '--nthread', required=True, type=int,
+                    help = 'number of threads for each mapper to be launched')
+parser.add_argument('-i', '--input', required=True,
+                    help = 'input path in HDFS')
+parser.add_argument('-o', '--output', required=True,
+                    help = 'output path in HDFS')
+parser.add_argument('-v', '--verbose', default=0, choices=[0, 1], type=int,
+                    help = 'print more messages into the console')
+parser.add_argument('-ac', '--auto_file_cache', default=1, choices=[0, 1], type=int,
+                    help = 'whether to automatically cache the files in the command as hadoop local files; this is on by default')
+parser.add_argument('-f', '--files', nargs = '*',
+                    help = 'the cached file list in mapreduce,'\
+                    ' the submission script will automatically cache all the files which appear in the command.'\
+                    ' You may need this option to cache additional files.'\
+                    ' You can also use it to manually cache files when auto_file_cache is off')
+parser.add_argument('--jobname', help = 'customize jobname in tracker')
+if hadoop_binary == None:
+    parser.add_argument('-hb', '--hadoop_binary', required = True,
+                        help = 'path to the hadoop binary folder')
+else:
+    parser.add_argument('-hb', '--hadoop_binary', default = hadoop_binary,
+                        help = 'path to the hadoop binary folder')
+
+if hadoop_streaming_jar == None:
+    parser.add_argument('-jar', '--hadoop_streaming_jar', required = True,
+                        help = 'path to the hadoop streaming jar file')
+else:
+    parser.add_argument('-jar', '--hadoop_streaming_jar', default = hadoop_streaming_jar,
+                        help = 'path to the hadoop streaming jar file')
+parser.add_argument('command', nargs='+',
+                    help = 'command for the rabit program')
+args = parser.parse_args()
+
+if args.jobname is None:
+    args.jobname = ('Rabit(nworker=%d):' % args.nworker) + args.command[0].split('/')[-1]
+
+def hadoop_streaming(nworker, slave_args):
+    cmd = '%s jar %s -D mapreduce.job.maps=%d' % (args.hadoop_binary, args.hadoop_streaming_jar, nworker)
+    cmd += ' -D mapreduce.job.name=%s' % (args.jobname)
+    cmd += ' -D mapreduce.map.cpu.vcores=%d' % (args.nthread)
+    cmd += ' -input %s -output %s' % (args.input, args.output)
+    cmd += ' -mapper \"%s\" -reducer \"/bin/cat\" ' % (' '.join(args.command + slave_args))
+    fset = set()
+    if args.auto_file_cache:
+        for f in args.command:
+            if os.path.exists(f):
+                fset.add(f)
+    for flst in (args.files or []):  # -f may be absent; args.files is None then
+        for f in flst.split('#'):
+            fset.add(f)
+    for f in fset:
+        cmd += ' -file %s' % f
+    print cmd
+    subprocess.check_call(cmd, shell = True)
+
+tracker.submit(args.nworker, [], fun_submit = hadoop_streaming, verbose = args.verbose)
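
Note: to make the submission concrete, the following standalone Python 2 sketch mirrors the command string that hadoop_streaming() above assembles. The hadoop paths, the HDFS input/output paths, and the ./rabit_demo program name are made-up placeholders, and the real slave_args are injected by rabit_tracker at submission time; nothing is actually submitted.

# sketch of the streaming command built by hadoop_streaming() (Python 2)
# all values below are hypothetical placeholders
nworker, nthread = 4, 2
command = ['./rabit_demo', 'train.conf']
slave_args = ['<extra args injected by rabit_tracker>']
jobname = ('Rabit(nworker=%d):' % nworker) + command[0].split('/')[-1]
cmd = '%s jar %s -D mapreduce.job.maps=%d' % ('hadoop', '/opt/hadoop/lib/hadoop-streaming.jar', nworker)
cmd += ' -D mapreduce.job.name=%s' % (jobname)
cmd += ' -D mapreduce.map.cpu.vcores=%d' % (nthread)
cmd += ' -input %s -output %s' % ('/data/in', '/data/out')
cmd += ' -mapper \"%s\" -reducer \"/bin/cat\" ' % (' '.join(command + slave_args))
print cmd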

From b5ac85f103033a02802a90faf8361d700931dcc9 Mon Sep 17 00:00:00 2001
From: chenshuaihua
Date: Sun, 11 Jan 2015 23:19:04 +0800
Subject: [PATCH 2/7] yarn script

---
 tracker/rabit_yarn.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/tracker/rabit_yarn.py b/tracker/rabit_yarn.py
index 2f896b2a..9922b55d 100644
--- a/tracker/rabit_yarn.py
+++ b/tracker/rabit_yarn.py
@@ -46,7 +46,14 @@
                     ' the submission script will automatically cache all the files which appear in the command.'\
                     ' You may need this option to cache additional files.'\
                     ' You can also use it to manually cache files when auto_file_cache is off')
-parser.add_argument('--jobname', help = 'customize jobname in tracker')
+parser.add_argument('--jobname', default='auto', help = 'customize jobname in tracker')
+parser.add_argument('--timeout', default=600000000, type=int,
+        help = 'timeout (in milliseconds) of each mapper job; automatically set to a very long time,'\
+        ' normally you do not need to set this')
+parser.add_argument('-m', '--memory_mb', default=1024, type=int,
+        help = 'maximum memory (in MB) used by each mapper process. Guide: if you are running multi-threaded rabit,'\
+        ' set it large (near mapreduce.jobtracker.maxmapmemory.mb), so that each node can occupy all the'\
+        ' mapper slots in a machine for maximum performance')
 if hadoop_binary == None:
     parser.add_argument('-hb', '--hadoop_binary', required = True,
                         help = 'path to the hadoop binary folder')
@@ -71,6 +78,8 @@ def hadoop_streaming(nworker, slave_args):
     cmd = '%s jar %s -D mapreduce.job.maps=%d' % (args.hadoop_binary, args.hadoop_streaming_jar, nworker)
     cmd += ' -D mapreduce.job.name=%s' % (args.jobname)
     cmd += ' -D mapreduce.map.cpu.vcores=%d' % (args.nthread)
+    cmd += ' -D mapreduce.task.timeout=%d' % (args.timeout)
+    cmd += ' -D mapreduce.map.memory.mb=%d' % (args.memory_mb)
     cmd += ' -input %s -output %s' % (args.input, args.output)
     cmd += ' -mapper \"%s\" -reducer \"/bin/cat\" ' % (' '.join(args.command + slave_args))
     fset = set()

From 5e843cfbbd58a9d77d837d6f5711b2a30a9fa89a Mon Sep 17 00:00:00 2001
From: chenshuaihua
Date: Sun, 11 Jan 2015 23:22:26 +0800
Subject: [PATCH 3/7] yarn script

---
 tracker/rabit_yarn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tracker/rabit_yarn.py b/tracker/rabit_yarn.py
index 9922b55d..76af354e 100644
--- a/tracker/rabit_yarn.py
+++ b/tracker/rabit_yarn.py
@@ -78,8 +78,8 @@ def hadoop_streaming(nworker, slave_args):
     cmd = '%s jar %s -D mapreduce.job.maps=%d' % (args.hadoop_binary, args.hadoop_streaming_jar, nworker)
     cmd += ' -D mapreduce.job.name=%s' % (args.jobname)
     cmd += ' -D mapreduce.map.cpu.vcores=%d' % (args.nthread)
-    cmd += ' -D mapreduce.task.timeout=%d' % (args.timeout)
     cmd += ' -D mapreduce.map.memory.mb=%d' % (args.memory_mb)
+    cmd += ' -D mapreduce.task.timeout=%d' % (args.timeout)
     cmd += ' -input %s -output %s' % (args.input, args.output)
     cmd += ' -mapper \"%s\" -reducer \"/bin/cat\" ' % (' '.join(args.command + slave_args))
     fset = set()
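
Note: mapreduce.task.timeout is expressed in milliseconds, so the --timeout default of 600000000 introduced above amounts to roughly a week, i.e. the mapper is effectively never killed for inactivity. A quick Python 2 sanity check (pure arithmetic, no cluster needed; the 1024 MB value is just the --memory_mb default):

# sanity check of the -D values added by the two patches above
timeout_ms = 600000000
print 'timeout in days: %.2f' % (timeout_ms / 1000.0 / 86400.0)   # -> 6.94
print ' -D mapreduce.task.timeout=%d -D mapreduce.map.memory.mb=%d' % (timeout_ms, 1024)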

From 981f69ff55b3f13bd3f65e32dfa2fd4859017b29 Mon Sep 17 00:00:00 2001
From: chenshuaihua
Date: Sun, 11 Jan 2015 23:23:58 +0800
Subject: [PATCH 4/7] yarn script

---
 tracker/rabit_yarn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tracker/rabit_yarn.py b/tracker/rabit_yarn.py
index 76af354e..bc794ba1 100644
--- a/tracker/rabit_yarn.py
+++ b/tracker/rabit_yarn.py
@@ -79,7 +79,7 @@ def hadoop_streaming(nworker, slave_args):
     cmd += ' -D mapreduce.job.name=%s' % (args.jobname)
     cmd += ' -D mapreduce.map.cpu.vcores=%d' % (args.nthread)
     cmd += ' -D mapreduce.map.memory.mb=%d' % (args.memory_mb)
-    cmd += ' -D mapreduce.task.timeout=%d' % (args.timeout)
+cmd += ' -D mapreduce.task.timeout=%d' % (args.timeout)
     cmd += ' -input %s -output %s' % (args.input, args.output)
     cmd += ' -mapper \"%s\" -reducer \"/bin/cat\" ' % (' '.join(args.command + slave_args))
     fset = set()

From 00323f462aaf7156805ba4b40272d9e36afc7eca Mon Sep 17 00:00:00 2001
From: chenshuaihua
Date: Sun, 11 Jan 2015 23:32:14 +0800
Subject: [PATCH 5/7] yarn script

---
 tracker/rabit_yarn.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tracker/rabit_yarn.py b/tracker/rabit_yarn.py
index bc794ba1..3ba8ddfc 100644
--- a/tracker/rabit_yarn.py
+++ b/tracker/rabit_yarn.py
@@ -74,12 +74,14 @@
 if args.jobname is None:
     args.jobname = ('Rabit(nworker=%d):' % args.nworker) + args.command[0].split('/')[-1]
+
+
 
 def hadoop_streaming(nworker, slave_args):
     cmd = '%s jar %s -D mapreduce.job.maps=%d' % (args.hadoop_binary, args.hadoop_streaming_jar, nworker)
-    cmd += ' -D mapreduce.job.name=%s' % (args.jobname)
-    cmd += ' -D mapreduce.map.cpu.vcores=%d' % (args.nthread)
+    cmd += ' -D mapreduce.job.name' % (args.jobname)
+    cmd += ' -D mapreduce.task.timeout=%d' % (args.timeout)
     cmd += ' -D mapreduce.map.memory.mb=%d' % (args.memory_mb)
+    cmd += ' -D mapreduce.map.cpu.vcores=%d' % (args.nthread)
     cmd += ' -input %s -output %s' % (args.input, args.output)
     cmd += ' -mapper \"%s\" -reducer \"/bin/cat\" ' % (' '.join(args.command + slave_args))
     fset = set()

From 26b5fdac40aae50a4e9a38a5fffc6c80da6553cd Mon Sep 17 00:00:00 2001
From: chenshuaihua
Date: Sun, 11 Jan 2015 23:54:31 +0800
Subject: [PATCH 6/7] yarn script

---
 tracker/rabit_yarn.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/tracker/rabit_yarn.py b/tracker/rabit_yarn.py
index 3ba8ddfc..b5bd5367 100644
--- a/tracker/rabit_yarn.py
+++ b/tracker/rabit_yarn.py
@@ -71,10 +71,9 @@
                     help = 'command for the rabit program')
 args = parser.parse_args()
 
-if args.jobname is None:
-    args.jobname = ('Rabit(nworker=%d):' % args.nworker) + args.command[0].split('/')[-1]
-
-
+if args.jobname == 'auto':
+    args.jobname = ('Rabit[nworker=%d]:' % args.nworker) + args.command[0].split('/')[-1]
+
 
 def hadoop_streaming(nworker, slave_args):
     cmd = '%s jar %s -D mapreduce.job.maps=%d' % (args.hadoop_binary, args.hadoop_streaming_jar, nworker)
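
Note: with the 'auto' default introduced above, the tracker job name is derived from the worker count and the basename of the first command token. A Python 2 illustration, using a hypothetical command list:

# what --jobname 'auto' expands to (hypothetical command and worker count)
nworker = 4
command = ['../bin/rabit_demo', 'train.conf']
print ('Rabit[nworker=%d]:' % nworker) + command[0].split('/')[-1]
# -> Rabit[nworker=4]:rabit_demo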

From b2dec958621cf5384f67f467604c1dcf1cfe1fe6 Mon Sep 17 00:00:00 2001
From: chenshuaihua
Date: Mon, 12 Jan 2015 00:09:00 +0800
Subject: [PATCH 7/7] yarn script

---
 tracker/rabit_yarn.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/tracker/rabit_yarn.py b/tracker/rabit_yarn.py
index b5bd5367..0b542c09 100644
--- a/tracker/rabit_yarn.py
+++ b/tracker/rabit_yarn.py
@@ -21,7 +21,7 @@
         hadoop_binary = hadoop_home + '/bin/hadoop'
         assert os.path.exists(hadoop_binary), "HADOOP_HOME does not contain the hadoop binary"
     if hadoop_streaming_jar == None:
-	hadoop_streaming_jar = hadoop_home + '/lib/hadoop-streaming.jar'
+        hadoop_streaming_jar = hadoop_home + '/lib/hadoop-streaming.jar'
         assert os.path.exists(hadoop_streaming_jar), "HADOOP_HOME does not contain the hadoop streaming jar"
 
 if hadoop_binary == None or hadoop_streaming_jar == None:
@@ -48,12 +48,14 @@
                     ' You can also use it to manually cache files when auto_file_cache is off')
 parser.add_argument('--jobname', default='auto', help = 'customize jobname in tracker')
 parser.add_argument('--timeout', default=600000000, type=int,
-        help = 'timeout (in milliseconds) of each mapper job; automatically set to a very long time,'\
-        ' normally you do not need to set this')
+                    help = 'timeout (in milliseconds) of each mapper job; automatically set to a very long time,'\
+                    ' normally you do not need to set this')
 parser.add_argument('-m', '--memory_mb', default=1024, type=int,
-        help = 'maximum memory (in MB) used by each mapper process. Guide: if you are running multi-threaded rabit,'\
-        ' set it large (near mapreduce.jobtracker.maxmapmemory.mb), so that each node can occupy all the'\
-        ' mapper slots in a machine for maximum performance')
+                    help = 'maximum memory (in MB) used by each mapper process. Guide: if you are running multi-threaded rabit,'\
+                    ' set it large (near mapreduce.jobtracker.maxmapmemory.mb), so that each node can occupy all the'\
+                    ' mapper slots in a machine for maximum performance')
+
+
 if hadoop_binary == None:
     parser.add_argument('-hb', '--hadoop_binary', required = True,
                         help = 'path to the hadoop binary folder')
@@ -72,15 +74,14 @@
 args = parser.parse_args()
 
 if args.jobname == 'auto':
-    args.jobname = ('Rabit[nworker=%d]:' % args.nworker) + args.command[0].split('/')[-1]
-
-
+    args.jobname = ('Rabit[nworker=%d]:' % args.nworker) + args.command[0].split('/')[-1]
+
 def hadoop_streaming(nworker, slave_args):
     cmd = '%s jar %s -D mapreduce.job.maps=%d' % (args.hadoop_binary, args.hadoop_streaming_jar, nworker)
-    cmd += ' -D mapreduce.job.name' % (args.jobname)
+    cmd += ' -D mapreduce.job.name=%s' % (args.jobname)
+    cmd += ' -D mapreduce.map.cpu.vcores=%d' % (args.nthread)
     cmd += ' -D mapreduce.task.timeout=%d' % (args.timeout)
     cmd += ' -D mapreduce.map.memory.mb=%d' % (args.memory_mb)
-    cmd += ' -D mapreduce.map.cpu.vcores=%d' % (args.nthread)
     cmd += ' -input %s -output %s' % (args.input, args.output)
     cmd += ' -mapper \"%s\" -reducer \"/bin/cat\" ' % (' '.join(args.command + slave_args))
     fset = set()
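
Note: after this final patch, which repairs the mapreduce.job.name bug introduced in patch 5 and restores the cpu.vcores option, a submission could look like the following Python 2 sketch. Every path, file name, and count below is a made-up example, and rabit_tracker.py must sit next to the script for tracker.submit() to work.

# hypothetical end-to-end submission of the finished script
import subprocess
subprocess.check_call(
    'python rabit_yarn.py -nw 4 -nt 2 -m 2048'
    ' -i /user/me/train.data -o /user/me/model.out'
    ' ./rabit_demo train.conf', shell=True)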