Skip to content

Commit

Permalink
Closes #1426: Run IIS experiments by relying on spark 3.4 version
Browse files Browse the repository at this point in the history
WIP.

Fixing the changed results order in patent and software entity exporter integration tests.

Introducing the fixes required by the various `iis-wf-export-actionmanager` exporters relying on spark3 so that their integration tests succeed:
* setting `spark.extraListeners` and `spark.sql.queryExecutionListeners` explicitly to empty values in order to avoid relying on the Cloudera listeners, which are spark2-compliant and incompatible with spark3
* setting `spark.shuffle.useOldFetchProtocol=true` in order to satisfy the backward-compatibility requirement from the `2.4 to 3.0 migration guide` regarding the protocol used for fetching shuffle blocks (and to avoid `IllegalArgumentException: Unexpected message type: <number>` kind of errors)
The following modules received similar `workflow.xml` changes, but their spark3 compatibility has not been fully tested yet:
* `iis-wf-affmatching`
* `iis-wf-citationmatching-direct`
* `iis-wf-citationmatching`
* `iis-wf-documentsclassification`
* `iis-wf-import` (`content_url/core_parquet`, `infospace`, `patent`)
* `iis-wf-referenceextraction` (`community`, `concept`, `covid19`, `patent`, `project/funder_report`, `researchinitiative`, `softwareurl`)
* `iis-wf-transformers` (`avro2json`)
  • Loading branch information
marekhorst committed May 8, 2024
1 parent 2e40641 commit 337b8ef
Show file tree
Hide file tree
Showing 22 changed files with 41 additions and 59 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@

<action name="affmatching_dedup">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>document-organization deduplication</name>
<class>eu.dnetlib.iis.wf.affmatching.AffMatchingDedupJob</class>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@
<action name="affiliation-matching">
<spark xmlns="uri:oozie:spark-action:0.2">

<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>Affiliation Matching</name>
<class>eu.dnetlib.iis.wf.affmatching.AffMatchingJob</class>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@
<action name="proj-based-matching">
<spark xmlns="uri:oozie:spark-action:0.2">

<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>Project based Document-Organization Matching</name>
<class>eu.dnetlib.iis.wf.affmatching.ProjectBasedMatchingJob</class>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
<action name="citationmatchig-direct">
<spark xmlns="uri:oozie:spark-action:0.2">

<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>citationmatching_direct</name>
<class>eu.dnetlib.iis.wf.citationmatching.direct.CitationMatchingDirectJob</class>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@

<action name="citation-matching-input-transformer">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>citation-matching-input-transformer</name>

Expand Down Expand Up @@ -132,7 +132,7 @@

<action name="citation-matching">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>citation-matching</name>

Expand Down Expand Up @@ -165,7 +165,7 @@

<action name="citation-matching-output-transformer">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>citation-matching-output-transformer</name>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@
<action name="document_classification_job">
<spark xmlns="uri:oozie:spark-action:0.2">

<master>yarn-cluster</master>
<master>yarn</master>

<mode>cluster</mode>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,16 +90,6 @@
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
Expand Down Expand Up @@ -133,7 +123,7 @@

<action name="patent-exporter">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>patent spark exporter</name>
<class>eu.dnetlib.iis.wf.export.actionmanager.entity.patent.PatentExporterJob</class>
Expand All @@ -142,8 +132,9 @@
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.extraListeners=
--conf spark.sql.queryExecutionListeners=
--conf spark.shuffle.useOldFetchProtocol=true
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,16 +59,6 @@
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
Expand Down Expand Up @@ -103,7 +93,7 @@
<action name="software-exporter">
<spark xmlns="uri:oozie:spark-action:0.2">

<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>software spark exporter</name>
<class>eu.dnetlib.iis.wf.export.actionmanager.entity.software.SoftwareExporterJob</class>
Expand All @@ -113,8 +103,9 @@
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.extraListeners=
--conf spark.sql.queryExecutionListeners=
--conf spark.shuffle.useOldFetchProtocol=true
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,12 @@
<arg>eu.dnetlib.iis.wf.export.actionmanager.sequencefile.TestingConsumer</arg>
<arg>-Iseqfile=${workingDir}/output/document_patent/patent-actionset-id</arg>
<arg>
-Pexpectation_file_paths=/eu/dnetlib/iis/wf/export/actionmanager/entity/patent/default/output/document2_to_patent2.properties,
/eu/dnetlib/iis/wf/export/actionmanager/entity/patent/default/output/patent2_to_document2.properties,
-Pexpectation_file_paths=/eu/dnetlib/iis/wf/export/actionmanager/entity/patent/default/output/patent2_to_document2.properties,
/eu/dnetlib/iis/wf/export/actionmanager/entity/patent/default/output/document1_to_patent1.properties,
/eu/dnetlib/iis/wf/export/actionmanager/entity/patent/default/output/patent1_to_document1.properties,
/eu/dnetlib/iis/wf/export/actionmanager/entity/patent/default/output/document2_to_patent1.properties,
/eu/dnetlib/iis/wf/export/actionmanager/entity/patent/default/output/patent1_to_document2.properties
/eu/dnetlib/iis/wf/export/actionmanager/entity/patent/default/output/patent1_to_document2.properties,
/eu/dnetlib/iis/wf/export/actionmanager/entity/patent/default/output/document2_to_patent2.properties
</arg>
</java>
<ok to="patent-entity-consumer"/>
Expand All @@ -119,8 +119,8 @@
<arg>eu.dnetlib.iis.wf.export.actionmanager.sequencefile.TestingConsumer</arg>
<arg>-Iseqfile=${workingDir}/output/entities_patent/patent-entity-actionset-id</arg>
<arg>
-Pexpectation_file_paths=/eu/dnetlib/iis/wf/export/actionmanager/entity/patent/default/output/patent1.expectations,
/eu/dnetlib/iis/wf/export/actionmanager/entity/patent/default/output/patent2.expectations
-Pexpectation_file_paths=/eu/dnetlib/iis/wf/export/actionmanager/entity/patent/default/output/patent2.expectations,
/eu/dnetlib/iis/wf/export/actionmanager/entity/patent/default/output/patent1.expectations
</arg>
</java>
<ok to="report-consumer"/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,8 @@
<main-class>eu.dnetlib.iis.common.java.ProcessWrapper</main-class>
<arg>eu.dnetlib.iis.wf.export.actionmanager.sequencefile.TestingConsumer</arg>
<arg>-Iseqfile=${workingDir}/output/document_software_url/software-actionset-id</arg>
<arg>-Pexpectation_file_paths=/eu/dnetlib/iis/wf/export/actionmanager/entity/software/default/output/document_to_software.properties,
/eu/dnetlib/iis/wf/export/actionmanager/entity/software/default/output/software_to_document.properties
<arg>-Pexpectation_file_paths=/eu/dnetlib/iis/wf/export/actionmanager/entity/software/default/output/software_to_document.properties,
/eu/dnetlib/iis/wf/export/actionmanager/entity/software/default/output/document_to_software.properties
</arg>
</java>
<ok to="report-consumer" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,8 @@
<main-class>eu.dnetlib.iis.common.java.ProcessWrapper</main-class>
<arg>eu.dnetlib.iis.wf.export.actionmanager.sequencefile.TestingConsumer</arg>
<arg>-Iseqfile=${workingDir}/output/document_software_url/software-actionset-id</arg>
<arg>-Pexpectation_file_paths=/eu/dnetlib/iis/wf/export/actionmanager/entity/software/heritage/output/document_to_software.properties,
/eu/dnetlib/iis/wf/export/actionmanager/entity/software/heritage/output/software_to_document.properties
<arg>-Pexpectation_file_paths=/eu/dnetlib/iis/wf/export/actionmanager/entity/software/heritage/output/software_to_document.properties,
/eu/dnetlib/iis/wf/export/actionmanager/entity/software/heritage/output/document_to_software.properties
</arg>
</java>
<ok to="report-consumer" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@

<action name="content-url-importer">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>import-content-url_core</name>
<class>eu.dnetlib.iis.wf.importer.content.HiveBasedDocumentContentUrlImporterJob</class>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@

<action name="infospace-importer">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>import-infospace-hdp</name>
<class>eu.dnetlib.iis.wf.importer.infospace.ImportInformationSpaceJob</class>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@

<action name="patent-importer">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>import-patent</name>
<class>eu.dnetlib.iis.wf.importer.patent.PatentReaderJob</class>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@
</property>
</configuration>

<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>community-referenceextraction-input-transformer</name>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@
</property>
</configuration>

<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>referenceextraction_root_conceptid_report</name>
<class>eu.dnetlib.iis.wf.referenceextraction.concept.RootConceptIdReportJob</class>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>covid19-referenceextraction-input-transformer</name>
<class>eu.dnetlib.iis.wf.referenceextraction.covid19.input.Covid19ReferenceExtractionInputTransformerJob</class>
Expand Down Expand Up @@ -185,7 +185,7 @@
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>covid19-referenceextraction-output-transformer</name>
<class>eu.dnetlib.iis.wf.referenceextraction.covid19.output.Covid19ToConceptIdTransformerJob</class>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@

<action name="input_transformer">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>patent-referenceextraction-input-transformer</name>
<class>eu.dnetlib.iis.wf.referenceextraction.patent.input.PatentReferenceExtractionInputTransformerJob</class>
Expand Down Expand Up @@ -225,7 +225,7 @@

<action name="metadata_retriever_input_transformer">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>patent-metadata-retriever-input-transformer</name>
<class>eu.dnetlib.iis.wf.referenceextraction.patent.input.PatentMetadataRetrieverInputTransformerJob</class>
Expand All @@ -249,7 +249,7 @@

<action name="metadata_retriever">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>patent-metadata-retriever</name>
<class>eu.dnetlib.iis.wf.referenceextraction.patent.PatentMetadataRetrieverJob</class>
Expand Down Expand Up @@ -292,7 +292,7 @@

<action name="metadata_extractor">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>patent-metadata-extractor</name>
<class>eu.dnetlib.iis.wf.referenceextraction.patent.PatentMetadataExtractorJob</class>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>referenceextraction_project_funding_report</name>
<class>eu.dnetlib.iis.wf.referenceextraction.project.ProjectFunderReportJob</class>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>researchinitiative-input-metadata-transformer</name>
<class>eu.dnetlib.iis.wf.referenceextraction.researchinitiative.ResearchInitiativeReferenceExtractionInputTransformerJob</class>
Expand Down Expand Up @@ -107,7 +107,7 @@
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>researchinitiative-concept-transformer</name>
<class>eu.dnetlib.iis.wf.referenceextraction.researchinitiative.ResearchInitiativeMetadataTransformerJob</class>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@

<action name="cached-webcrawl">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>referenceextraction_softwareurl_webcrawl</name>
<class>eu.dnetlib.iis.wf.referenceextraction.softwareurl.CachedWebCrawlerJob</class>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@

<action name="transformer">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<master>yarn</master>
<mode>cluster</mode>
<name>avro2json_transformer_job</name>
<class>eu.dnetlib.iis.wf.transformers.avro2json.Avro2JsonTransformer</class>
Expand Down

0 comments on commit 337b8ef

Please sign in to comment.