spotify · Tarrasch · Aug 9, 2017 · May 21, 2017 · Aug 2, 2017 · Aug 2, 2017
@@ -237,8 +237,6 @@ class PySparkTask(SparkSubmitTask):
 
     # Path to the pyspark program passed to spark-submit
     app = os.path.join(os.path.dirname(__file__), 'pyspark_runner.py')
-    # Python only supports the client deploy mode, force it
-    deploy_mode = "client"
 
     @property
     def name(self):
@@ -250,6 +248,11 @@ def py_packages(self):
         if packages:
             return map(lambda s: s.strip(), packages.split(','))
 
+    @property
+    def files(self):
+        # app_command passes only the pickle's base name in cluster mode, so list the file here.
+        return [self.run_pickle] if self.deploy_mode == "cluster" else None
+
     def setup(self, conf):
         """
         Called by the pyspark_runner with a SparkConf instance that will be used to instantiate the SparkContext
@@ -269,11 +272,12 @@ def main(self, sc, *args):
         """
         raise NotImplementedError("subclass should define a main method")
 
-    def program_args(self):
-        return self.spark_command() + self.app_command()
-
     def app_command(self):
-        return [self.app, self.run_pickle] + self.app_options()
+        pickle_loc = self.run_pickle
+        if self.deploy_mode == "cluster":
+            # `files` lists the pickle in cluster mode, so pass just its base name.
+            pickle_loc = os.path.basename(pickle_loc)
+        return [self.app, pickle_loc] + self.app_options()
 
     def run(self):
         self.run_path = tempfile.mkdtemp(prefix=self.name)

@@ -188,7 +188,7 @@ def interrupt():
 class PySparkTaskTest(unittest.TestCase):
     ss = 'ss-stub'
 
-    @with_config({'spark': {'spark-submit': ss, 'master': "spark://host:7077"}})
+    @with_config({'spark': {'spark-submit': ss, 'master': "spark://host:7077", 'deploy-mode': 'client'}})
     @patch('luigi.contrib.external_program.subprocess.Popen')
     def test_run(self, proc):
         setup_run_process(proc)
@@ -199,7 +199,7 @@ def test_run(self, proc):
         self.assertTrue(os.path.exists(proc_arg_list[7]))
         self.assertTrue(proc_arg_list[8].endswith('TestPySparkTask.pickle'))
 
-    @with_config({'spark': {'spark-submit': ss, 'master': "spark://host:7077"}})
+    @with_config({'spark': {'spark-submit': ss, 'master': "spark://host:7077", 'deploy-mode': 'client'}})
     @patch('luigi.contrib.external_program.subprocess.Popen')
     def test_run_with_pickle_dump(self, proc):
         setup_run_process(proc)
@@ -211,6 +211,17 @@ def test_run_with_pickle_dump(self, proc):
         self.assertTrue(os.path.exists(proc_arg_list[7]))
         self.assertTrue(proc_arg_list[8].endswith('TestPySparkTask.pickle'))
 
+    @with_config({'spark': {'spark-submit': ss, 'master': "spark://host:7077", 'deploy-mode': 'cluster'}})
+    @patch('luigi.contrib.external_program.subprocess.Popen')
+    def test_run_with_cluster(self, proc):
+        setup_run_process(proc)
+        TestPySparkTask().run()
+        submit_argv = proc.call_args[0][0]
+        # NOTE(review): `files` is non-empty in cluster mode; if spark_command emits it before the app path, indexes 7/8 shift -- confirm.
+        self.assertEqual(submit_argv[:7], ['ss-stub', '--master', 'spark://host:7077', '--deploy-mode', 'cluster', '--name', 'TestPySparkTask'])
+        self.assertTrue(os.path.exists(submit_argv[7]))
+        self.assertTrue(submit_argv[8].endswith('TestPySparkTask.pickle'))
+
     @patch.dict('sys.modules', {'pyspark': MagicMock()})
     @patch('pyspark.SparkContext')
     def test_pyspark_runner(self, spark_context):