Closes #1426: Run IIS experiments by relying on spark 3.4 version

WIP. Introducing the required workflow.xml fixes for various workflows relying on spark3 so that their integration tests succeed:
* setting `spark.extraListeners` and `spark.sql.queryExecutionListeners` explicitly to empty values in order to avoid relying on the incompatible, spark2-compliant Cloudera listeners
* setting `spark.shuffle.useOldFetchProtocol=true` in order to address the `2.4 to 3.0 migration guide` requirement regarding backward compatibility of the protocol for fetching shuffle blocks (and to avoid `IllegalArgumentException: Unexpected message type: <number>` kinds of errors)

The following modules were covered with workflow.xml related changes, which resulted in successful integration test execution:
* `iis-wf-documentssimilarity` (the explicitly excluded `hadoop-mapreduce-client-app` is still among the spark342 sharelib dependencies, which causes test failures)
* `iis-wf-import` (the infospace importer still fails due to a spark3 regression, more details in #8941#note-35)
openaire · May 8, 2024 · fb2aec7 · fb2aec7
1 parent e6d1e28
commit fb2aec7
Show file tree

Hide file tree

Showing 5 changed files with 32 additions and 48 deletions.
diff --git a/iis-wf/iis-wf-documentssimilarity/pom.xml b/iis-wf/iis-wf-documentssimilarity/pom.xml
@@ -84,13 +84,33 @@
         <dependency>
             <groupId>org.apache.hadoop</groupId>
             <artifactId>hadoop-mapreduce-client-core</artifactId>
+            <exclusions>
+                <!-- causes class conflicts when running on spark3 -->
+                <exclusion>
+                    <groupId>org.apache.hadoop</groupId>
+                    <artifactId>hadoop-yarn-api</artifactId>
+                </exclusion>
+            </exclusions>
         </dependency>
 
         <dependency>
             <groupId>pl.edu.icm.coansys</groupId>
             <artifactId>document-similarity-oap-uberworkflow</artifactId>
             <type>tar.gz</type>
             <classifier>oozie-job</classifier>
+            <exclusions>
+                <!-- causes class conflicts when running on spark3 -->
+                <exclusion>
+                    <groupId>org.apache.hadoop</groupId>
+                    <artifactId>hadoop-hdfs</artifactId>
+                </exclusion>
+                <!-- causes class conflicts when running on spark3 -->
+                <!-- FIXME also needs to be excluded from the spark342 sharelib folder -->
+                <exclusion>
+                    <groupId>org.apache.hadoop</groupId>
+                    <artifactId>hadoop-mapreduce-client-app</artifactId>
+                </exclusion>
+            </exclusions>
         </dependency>
 
         <dependency>

diff --git a/.../eu/dnetlib/iis/wf/documentssimilarity/avro_to_protobuf/sampletest/oozie_app/workflow.xml b/.../eu/dnetlib/iis/wf/documentssimilarity/avro_to_protobuf/sampletest/oozie_app/workflow.xml
@@ -18,16 +18,6 @@
             <name>oozieActionShareLibForSpark2</name>
             <description>oozie action sharelib for spark 2.*</description>
         </property>
-        <property>
-            <name>spark2ExtraListeners</name>
-            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
-            <description>spark 2.* extra listeners classname</description>
-        </property>
-        <property>
-            <name>spark2SqlQueryExecutionListeners</name>
-            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
-            <description>spark 2.* sql query execution listeners classname</description>
-        </property>
         <property>
             <name>spark2YarnHistoryServerAddress</name>
             <description>spark 2.* yarn history server address</description>
@@ -110,8 +100,9 @@
                 --executor-memory=${sparkExecutorMemory}
                 --executor-cores=${sparkExecutorCores}
                 --driver-memory=${sparkDriverMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.extraListeners=
+                --conf spark.sql.queryExecutionListeners=
+                --conf spark.shuffle.useOldFetchProtocol=true
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             </spark-opts>

diff --git a/...main/resources/eu/dnetlib/iis/wf/importer/content_url/core_parquet/oozie_app/workflow.xml b/...main/resources/eu/dnetlib/iis/wf/importer/content_url/core_parquet/oozie_app/workflow.xml
@@ -39,16 +39,6 @@
             <name>oozieActionShareLibForSpark2</name>
             <description>oozie action sharelib for spark 2.*</description>
         </property>
-        <property>
-            <name>spark2ExtraListeners</name>
-            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
-            <description>spark 2.* extra listeners classname</description>
-        </property>
-        <property>
-            <name>spark2SqlQueryExecutionListeners</name>
-            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
-            <description>spark 2.* sql query execution listeners classname</description>
-        </property>
         <property>
             <name>spark2YarnHistoryServerAddress</name>
             <description>spark 2.* yarn history server address</description>
@@ -91,8 +81,9 @@
                 --executor-memory=${sparkExecutorMemory}
                 --executor-cores=${sparkExecutorCores}
                 --driver-memory=${sparkDriverMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.extraListeners=
+                --conf spark.sql.queryExecutionListeners=
+                --conf spark.shuffle.useOldFetchProtocol=true
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             </spark-opts>

diff --git a/...-wf-import/src/main/resources/eu/dnetlib/iis/wf/importer/infospace/oozie_app/workflow.xml b/...-wf-import/src/main/resources/eu/dnetlib/iis/wf/importer/infospace/oozie_app/workflow.xml
@@ -135,16 +135,6 @@
             <name>oozieActionShareLibForSpark2</name>
             <description>oozie action sharelib for spark 2.*</description>
         </property>
-        <property>
-            <name>spark2ExtraListeners</name>
-            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
-            <description>spark 2.* extra listeners classname</description>
-        </property>
-        <property>
-            <name>spark2SqlQueryExecutionListeners</name>
-            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
-            <description>spark 2.* sql query execution listeners classname</description>
-        </property>
         <property>
             <name>spark2YarnHistoryServerAddress</name>
             <description>spark 2.* yarn history server address</description>
@@ -187,8 +177,9 @@
                 --executor-memory=${sparkExecutorMemory}
                 --executor-cores=${sparkExecutorCores}
                 --driver-memory=${sparkDriverMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.extraListeners=
+                --conf spark.sql.queryExecutionListeners=
+                --conf spark.shuffle.useOldFetchProtocol=true
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             </spark-opts>

diff --git a/...iis-wf-import/src/main/resources/eu/dnetlib/iis/wf/importer/patent/oozie_app/workflow.xml b/...iis-wf-import/src/main/resources/eu/dnetlib/iis/wf/importer/patent/oozie_app/workflow.xml
@@ -34,16 +34,6 @@
             <name>oozieActionShareLibForSpark2</name>
             <description>oozie action sharelib for spark 2.*</description>
         </property>
-        <property>
-            <name>spark2ExtraListeners</name>
-            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
-            <description>spark 2.* extra listeners classname</description>
-        </property>
-        <property>
-            <name>spark2SqlQueryExecutionListeners</name>
-            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
-            <description>spark 2.* sql query execution listeners classname</description>
-        </property>
         <property>
             <name>spark2YarnHistoryServerAddress</name>
             <description>spark 2.* yarn history server address</description>
@@ -86,8 +76,9 @@
                 --executor-memory=${sparkExecutorMemory}
                 --executor-cores=${sparkExecutorCores}
                 --driver-memory=${sparkDriverMemory}
-                --conf spark.extraListeners=${spark2ExtraListeners}
-                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.extraListeners=
+                --conf spark.sql.queryExecutionListeners=
+                --conf spark.shuffle.useOldFetchProtocol=true
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             </spark-opts>