diff --git a/nixos/modules/services/cluster/hadoop/conf.nix b/nixos/modules/services/cluster/hadoop/conf.nix index 38db10406b9a0d6..69472408cabe59b 100644 --- a/nixos/modules/services/cluster/hadoop/conf.nix +++ b/nixos/modules/services/cluster/hadoop/conf.nix @@ -1,4 +1,4 @@ -{ hadoop, pkgs }: +{ cfg, pkgs, lib }: let propertyXml = name: value: '' @@ -13,19 +13,31 @@ let ${builtins.concatStringsSep "\n" (pkgs.lib.mapAttrsToList propertyXml properties)} ''; + cfgLine = name: value: '' + ${name}=${builtins.toString value} + ''; + cfgFile = fileName: properties: pkgs.writeTextDir fileName '' + # generated by NixOS + ${builtins.concatStringsSep "" (pkgs.lib.mapAttrsToList cfgLine properties)} + ''; userFunctions = '' hadoop_verify_logdir() { echo Skipping verification of log directory } ''; + hadoopEnv = '' + export HADOOP_LOG_DIR=/tmp/hadoop/$USER + ''; in -pkgs.buildEnv { - name = "hadoop-conf"; - paths = [ - (siteXml "core-site.xml" hadoop.coreSite) - (siteXml "hdfs-site.xml" hadoop.hdfsSite) - (siteXml "mapred-site.xml" hadoop.mapredSite) - (siteXml "yarn-site.xml" hadoop.yarnSite) - (pkgs.writeTextDir "hadoop-user-functions.sh" userFunctions) - ]; -} +pkgs.runCommand "hadoop-conf" {} '' + mkdir -p $out/ + cp ${siteXml "core-site.xml" cfg.coreSite}/* $out/ + cp ${siteXml "hdfs-site.xml" cfg.hdfsSite}/* $out/ + cp ${siteXml "mapred-site.xml" cfg.mapredSite}/* $out/ + cp ${siteXml "yarn-site.xml" cfg.yarnSite}/* $out/ + cp ${cfgFile "container-executor.cfg" cfg.containerExecutorCfg}/* $out/ + cp ${pkgs.writeTextDir "hadoop-user-functions.sh" userFunctions}/* $out/ + cp ${pkgs.writeTextDir "hadoop-env.sh" hadoopEnv}/* $out/ + cp ${cfg.log4jProperties} $out/log4j.properties + ${lib.concatMapStringsSep "\n" (dir: "cp -r ${dir}/* $out/") cfg.extraConfDirs} +'' diff --git a/nixos/modules/services/cluster/hadoop/default.nix b/nixos/modules/services/cluster/hadoop/default.nix index a165f619dc0c15e..da3e47b95d4dd30 100644 --- 
a/nixos/modules/services/cluster/hadoop/default.nix +++ b/nixos/modules/services/cluster/hadoop/default.nix @@ -1,5 +1,7 @@ { config, lib, pkgs, ...}: - +let + cfg = config.services.hadoop; +in with lib; { imports = [ ./yarn.nix ./hdfs.nix ]; @@ -17,7 +19,9 @@ with lib; }; hdfsSite = mkOption { - default = {}; + default = { + "dfs.namenode.rpc-bind-host" = "0.0.0.0"; + }; type = types.attrsOf types.anything; example = literalExpression '' { @@ -28,27 +32,81 @@ with lib; }; mapredSite = mkOption { - default = {}; + default = { + "mapreduce.framework.name" = "yarn"; + "yarn.app.mapreduce.am.env" = "HADOOP_MAPRED_HOME=${cfg.package}/lib/${cfg.package.untarDir}"; + "mapreduce.map.env" = "HADOOP_MAPRED_HOME=${cfg.package}/lib/${cfg.package.untarDir}"; + "mapreduce.reduce.env" = "HADOOP_MAPRED_HOME=${cfg.package}/lib/${cfg.package.untarDir}"; + }; type = types.attrsOf types.anything; example = literalExpression '' - { - "mapreduce.map.cpu.vcores" = "1"; + options.services.hadoop.mapredSite.default // { + "mapreduce.map.java.opts" = "-Xmx900m -XX:+UseParallelGC"; } ''; description = "Hadoop mapred-site.xml definition"; }; yarnSite = mkOption { - default = {}; + default = { + "yarn.nodemanager.admin-env" = "PATH=$PATH"; + "yarn.nodemanager.aux-services" = "mapreduce_shuffle"; + "yarn.nodemanager.aux-services.mapreduce_shuffle.class" = "org.apache.hadoop.mapred.ShuffleHandler"; + "yarn.nodemanager.bind-host" = "0.0.0.0"; + "yarn.nodemanager.container-executor.class" = "org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor"; + "yarn.nodemanager.env-whitelist" = "JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_HOME,LANG,TZ"; + "yarn.nodemanager.linux-container-executor.group" = "hadoop"; + "yarn.nodemanager.linux-container-executor.path" = "/run/wrappers/yarn-nodemanager/bin/container-executor"; + "yarn.nodemanager.log-dirs" = "/var/log/hadoop/yarn/nodemanager"; + "yarn.resourcemanager.bind-host" = 
"0.0.0.0"; + "yarn.resourcemanager.scheduler.class" = "org.apache.hadoop.yarn.server.resourcemanager.scheduler.fifo.FifoScheduler"; + }; type = types.attrsOf types.anything; example = literalExpression '' - { - "yarn.resourcemanager.ha.id" = "resourcemanager1"; + options.services.hadoop.yarnSite.default // { + "yarn.resourcemanager.hostname" = "''${config.networking.hostName}"; } ''; description = "Hadoop yarn-site.xml definition"; }; + log4jProperties = mkOption { + default = "${cfg.package}/lib/${cfg.package.untarDir}/etc/hadoop/log4j.properties"; + type = types.path; + example = literalExpression '' + "''${pkgs.hadoop}/lib/''${pkgs.hadoop.untarDir}/etc/hadoop/log4j.properties"; + ''; + description = "log4j.properties file added to HADOOP_CONF_DIR"; + }; + + containerExecutorCfg = mkOption { + default = { + # must be the same as yarn.nodemanager.linux-container-executor.group in yarnSite + "yarn.nodemanager.linux-container-executor.group"="hadoop"; + "min.user.id"=1000; + "feature.terminal.enabled"=1; + }; + type = types.attrsOf types.anything; + example = literalExpression '' + options.services.hadoop.containerExecutorCfg.default // { + "feature.terminal.enabled" = 0; + } + ''; + description = "Yarn container-executor.cfg definition"; + }; + + extraConfDirs = mkOption { + default = []; + type = types.listOf types.path; + example = literalExpression '' + [ + ./extraHDFSConfs + ./extraYARNConfs + ] + ''; + description = "Directories containing additional config files to be added to HADOOP_CONF_DIR"; + }; + package = mkOption { type = types.package; default = pkgs.hadoop; @@ -64,6 +122,12 @@ with lib; users.groups.hadoop = { gid = config.ids.gids.hadoop; }; + environment = { + systemPackages = [ cfg.package ]; + etc."hadoop-conf".source = let + hadoopConf = "${import ./conf.nix { inherit cfg pkgs lib; }}/"; + in "${hadoopConf}"; + }; }) ]; diff --git a/nixos/modules/services/cluster/hadoop/hdfs.nix b/nixos/modules/services/cluster/hadoop/hdfs.nix index 
4f4b0a92108fa4e..e347b682b902004 100644 --- a/nixos/modules/services/cluster/hadoop/hdfs.nix +++ b/nixos/modules/services/cluster/hadoop/hdfs.nix @@ -1,24 +1,54 @@ { config, lib, pkgs, ...}: +with lib; let cfg = config.services.hadoop; - hadoopConf = import ./conf.nix { hadoop = cfg; pkgs = pkgs; }; + hadoopConf = "${import ./conf.nix { inherit cfg pkgs lib; }}/"; + restartIfChanged = mkOption { + type = types.bool; + description = '' + Automatically restart the service on config change. + This can be set to false to defer restarts on clusters running critical applications. + Please consider the security implications of inadvertently running an older version, + and the possibility of unexpected behavior caused by inconsistent versions across a cluster when disabling this option. + ''; + default = false; + }; in -with lib; { options.services.hadoop.hdfs = { - namenode.enabled = mkOption { - type = types.bool; - default = false; - description = '' - Whether to run the Hadoop YARN NameNode - ''; + namenode = { + enabled = mkOption { + type = types.bool; + default = false; + description = '' + Whether to run the HDFS NameNode + ''; + }; + inherit restartIfChanged; + openFirewall = mkOption { + type = types.bool; + default = true; + description = '' + Open firewall ports for namenode + ''; + }; }; - datanode.enabled = mkOption { - type = types.bool; - default = false; - description = '' - Whether to run the Hadoop YARN DataNode - ''; + datanode = { + enabled = mkOption { + type = types.bool; + default = false; + description = '' + Whether to run the HDFS DataNode + ''; + }; + inherit restartIfChanged; + openFirewall = mkOption { + type = types.bool; + default = true; + description = '' + Open firewall ports for datanode + ''; + }; }; }; @@ -27,10 +57,7 @@ with lib; systemd.services.hdfs-namenode = { description = "Hadoop HDFS NameNode"; wantedBy = [ "multi-user.target" ]; - - environment = { - HADOOP_HOME = "${cfg.package}"; - }; + inherit (cfg.hdfs.namenode) 
restartIfChanged; preStart = '' ${cfg.package}/bin/hdfs --config ${hadoopConf} namenode -format -nonInteractive || true @@ -40,24 +67,34 @@ with lib; User = "hdfs"; SyslogIdentifier = "hdfs-namenode"; ExecStart = "${cfg.package}/bin/hdfs --config ${hadoopConf} namenode"; + Restart = "always"; }; }; + + networking.firewall.allowedTCPPorts = (mkIf cfg.hdfs.namenode.openFirewall [ + 9870 # namenode.http-address + 8020 # namenode.rpc-address + ]); }) (mkIf cfg.hdfs.datanode.enabled { systemd.services.hdfs-datanode = { description = "Hadoop HDFS DataNode"; wantedBy = [ "multi-user.target" ]; - - environment = { - HADOOP_HOME = "${cfg.package}"; - }; + inherit (cfg.hdfs.datanode) restartIfChanged; serviceConfig = { User = "hdfs"; SyslogIdentifier = "hdfs-datanode"; ExecStart = "${cfg.package}/bin/hdfs --config ${hadoopConf} datanode"; + Restart = "always"; }; }; + + networking.firewall.allowedTCPPorts = (mkIf cfg.hdfs.datanode.openFirewall [ + 9864 # datanode.http.address + 9866 # datanode.address + 9867 # datanode.ipc.address + ]); }) (mkIf ( cfg.hdfs.namenode.enabled || cfg.hdfs.datanode.enabled diff --git a/nixos/modules/services/cluster/hadoop/yarn.nix b/nixos/modules/services/cluster/hadoop/yarn.nix index c92020637e476a4..0086a53e3b74f41 100644 --- a/nixos/modules/services/cluster/hadoop/yarn.nix +++ b/nixos/modules/services/cluster/hadoop/yarn.nix @@ -1,24 +1,62 @@ { config, lib, pkgs, ...}: +with lib; let cfg = config.services.hadoop; - hadoopConf = import ./conf.nix { hadoop = cfg; pkgs = pkgs; }; + hadoopConf = "${import ./conf.nix { inherit cfg pkgs lib; }}/"; + restartIfChanged = mkOption { + type = types.bool; + description = '' + Automatically restart the service on config change. + This can be set to false to defer restarts on clusters running critical applications. 
+ Please consider the security implications of inadvertently running an older version, + and the possibility of unexpected behavior caused by inconsistent versions across a cluster when disabling this option. + ''; + default = false; + }; in -with lib; { options.services.hadoop.yarn = { - resourcemanager.enabled = mkOption { - type = types.bool; - default = false; - description = '' - Whether to run the Hadoop YARN ResourceManager - ''; + resourcemanager = { + enabled = mkOption { + type = types.bool; + default = false; + description = '' + Whether to run the Hadoop YARN ResourceManager + ''; + }; + inherit restartIfChanged; + openFirewall = mkOption { + type = types.bool; + default = true; + description = '' + Open firewall ports for resourcemanager + ''; + }; }; - nodemanager.enabled = mkOption { - type = types.bool; - default = false; - description = '' - Whether to run the Hadoop YARN NodeManager - ''; + nodemanager = { + enabled = mkOption { + type = types.bool; + default = false; + description = '' + Whether to run the Hadoop YARN NodeManager + ''; + }; + inherit restartIfChanged; + addBinBash = mkOption { + type = types.bool; + default = true; + description = '' + Add /bin/bash. This is needed by the linux container executor's launch script. + ''; + }; + openFirewall = mkOption { + type = types.bool; + default = true; + description = '' + Open firewall ports for nodemanager. + Because containers can listen on any ephemeral port, TCP ports 1024–65535 will be opened. 
+ ''; + }; }; }; @@ -38,36 +76,63 @@ with lib; systemd.services.yarn-resourcemanager = { description = "Hadoop YARN ResourceManager"; wantedBy = [ "multi-user.target" ]; - - environment = { - HADOOP_HOME = "${cfg.package}"; - }; + inherit (cfg.yarn.resourcemanager) restartIfChanged; serviceConfig = { User = "yarn"; SyslogIdentifier = "yarn-resourcemanager"; ExecStart = "${cfg.package}/bin/yarn --config ${hadoopConf} " + " resourcemanager"; + Restart = "always"; }; }; + networking.firewall.allowedTCPPorts = (mkIf cfg.yarn.resourcemanager.openFirewall [ + 8088 # resourcemanager.webapp.address + 8030 # resourcemanager.scheduler.address + 8031 # resourcemanager.resource-tracker.address + 8032 # resourcemanager.address + ]); }) (mkIf cfg.yarn.nodemanager.enabled { + # Needed because yarn hardcodes /bin/bash in container start scripts + # These scripts can't be patched, they are generated at runtime + systemd.tmpfiles.rules = [ + (mkIf cfg.yarn.nodemanager.addBinBash "L /bin/bash - - - - /run/current-system/sw/bin/bash") + ]; + systemd.services.yarn-nodemanager = { description = "Hadoop YARN NodeManager"; wantedBy = [ "multi-user.target" ]; + inherit (cfg.yarn.nodemanager) restartIfChanged; - environment = { - HADOOP_HOME = "${cfg.package}"; - }; + preStart = '' + # create log dir + mkdir -p /var/log/hadoop/yarn/nodemanager + chown yarn:hadoop /var/log/hadoop/yarn/nodemanager + + # set up setuid container executor binary + rm -rf /run/wrappers/yarn-nodemanager/ || true + mkdir -p /run/wrappers/yarn-nodemanager/{bin,etc/hadoop} + cp ${cfg.package}/lib/${cfg.package.untarDir}/bin/container-executor /run/wrappers/yarn-nodemanager/bin/ + chgrp hadoop /run/wrappers/yarn-nodemanager/bin/container-executor + chmod 6050 /run/wrappers/yarn-nodemanager/bin/container-executor + cp ${hadoopConf}/container-executor.cfg /run/wrappers/yarn-nodemanager/etc/hadoop/ + ''; serviceConfig = { User = "yarn"; SyslogIdentifier = "yarn-nodemanager"; + PermissionsStartOnly = true; ExecStart = 
"${cfg.package}/bin/yarn --config ${hadoopConf} " + " nodemanager"; + Restart = "always"; }; }; + + networking.firewall.allowedTCPPortRanges = [ + (mkIf (cfg.yarn.nodemanager.openFirewall) {from = 1024; to = 65535;}) + ]; }) ]; diff --git a/nixos/tests/all-tests.nix b/nixos/tests/all-tests.nix index 12b67008291ecc3..ea7d40b34e1dd47 100644 --- a/nixos/tests/all-tests.nix +++ b/nixos/tests/all-tests.nix @@ -165,6 +165,7 @@ in grocy = handleTest ./grocy.nix {}; grub = handleTest ./grub.nix {}; gvisor = handleTest ./gvisor.nix {}; + hadoop.all = handleTestOn [ "x86_64-linux" ] ./hadoop/hadoop.nix {}; hadoop.hdfs = handleTestOn [ "x86_64-linux" ] ./hadoop/hdfs.nix {}; hadoop.yarn = handleTestOn [ "x86_64-linux" ] ./hadoop/yarn.nix {}; handbrake = handleTestOn ["x86_64-linux"] ./handbrake.nix {}; @@ -414,6 +415,7 @@ in solr = handleTest ./solr.nix {}; sonarr = handleTest ./sonarr.nix {}; spacecookie = handleTest ./spacecookie.nix {}; + spark = handleTestOn ["x86_64-linux"] ./spark {}; spike = handleTest ./spike.nix {}; sslh = handleTest ./sslh.nix {}; sssd = handleTestOn ["x86_64-linux"] ./sssd.nix {}; diff --git a/nixos/tests/hadoop/hadoop.nix b/nixos/tests/hadoop/hadoop.nix new file mode 100644 index 000000000000000..46dfac26e065b85 --- /dev/null +++ b/nixos/tests/hadoop/hadoop.nix @@ -0,0 +1,70 @@ +import ../make-test-python.nix ({pkgs, ...}: { + + nodes = let + package = pkgs.hadoop; + coreSite = { + "fs.defaultFS" = "hdfs://master"; + }; + in { + master = {pkgs, options, ...}: { + services.hadoop = { + inherit package coreSite; + hdfs.namenode.enabled = true; + yarn.resourcemanager.enabled = true; + }; + virtualisation.memorySize = 1024; + }; + + worker = {pkgs, options, ...}: { + services.hadoop = { + inherit package coreSite; + hdfs.datanode.enabled = true; + yarn.nodemanager.enabled = true; + yarnSite = options.services.hadoop.yarnSite.default // { + "yarn.resourcemanager.hostname" = "master"; + }; + }; + virtualisation.memorySize = 2048; + }; + }; + + testScript 
= '' + start_all() + + master.wait_for_unit("network.target") + master.wait_for_unit("hdfs-namenode") + + master.wait_for_open_port(8020) + master.wait_for_open_port(9870) + + worker.wait_for_unit("network.target") + worker.wait_for_unit("hdfs-datanode") + worker.wait_for_open_port(9864) + worker.wait_for_open_port(9866) + worker.wait_for_open_port(9867) + + master.succeed("curl -f http://worker:9864") + worker.succeed("curl -f http://master:9870") + + worker.succeed("sudo -u hdfs hdfs dfsadmin -safemode wait") + + master.wait_for_unit("yarn-resourcemanager") + + master.wait_for_open_port(8030) + master.wait_for_open_port(8031) + master.wait_for_open_port(8032) + master.wait_for_open_port(8088) + worker.succeed("curl -f http://master:8088") + + worker.wait_for_unit("yarn-nodemanager") + worker.wait_for_open_port(8042) + worker.wait_for_open_port(8040) + master.succeed("curl -f http://worker:8042") + + assert "Total Nodes:1" in worker.succeed("yarn node -list") + + assert "Estimated value of Pi is" in worker.succeed("HADOOP_USER_NAME=hdfs yarn jar $(readlink $(which yarn) | sed -r 's~bin/yarn~lib/hadoop-*/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar~g') pi 2 10") + assert "SUCCEEDED" in worker.succeed("yarn application -list -appStates FINISHED") + worker.succeed("sudo -u hdfs hdfs dfs -ls / | systemd-cat") + ''; + }) diff --git a/nixos/tests/hadoop/hdfs.nix b/nixos/tests/hadoop/hdfs.nix index f1f98ed42eb31ab..f5907185c039eca 100644 --- a/nixos/tests/hadoop/hdfs.nix +++ b/nixos/tests/hadoop/hdfs.nix @@ -2,7 +2,7 @@ import ../make-test-python.nix ({...}: { nodes = { namenode = {pkgs, ...}: { services.hadoop = { - package = pkgs.hadoop_3_1; + package = pkgs.hadoop; hdfs.namenode.enabled = true; coreSite = { "fs.defaultFS" = "hdfs://namenode:8020"; @@ -20,7 +20,7 @@ import ../make-test-python.nix ({...}: { }; datanode = {pkgs, ...}: { services.hadoop = { - package = pkgs.hadoop_3_1; + package = pkgs.hadoop; hdfs.datanode.enabled = true; coreSite = { 
"fs.defaultFS" = "hdfs://namenode:8020"; diff --git a/nixos/tests/hadoop/yarn.nix b/nixos/tests/hadoop/yarn.nix index 01077245d397380..fbbb293eecd6b1b 100644 --- a/nixos/tests/hadoop/yarn.nix +++ b/nixos/tests/hadoop/yarn.nix @@ -1,7 +1,7 @@ import ../make-test-python.nix ({...}: { nodes = { resourcemanager = {pkgs, ...}: { - services.hadoop.package = pkgs.hadoop_3_1; + services.hadoop.package = pkgs.hadoop; services.hadoop.yarn.resourcemanager.enabled = true; services.hadoop.yarnSite = { "yarn.resourcemanager.scheduler.class" = "org.apache.hadoop.yarn.server.resourcemanager.scheduler.fifo.FifoScheduler"; @@ -12,7 +12,7 @@ import ../make-test-python.nix ({...}: { ]; }; nodemanager = {pkgs, ...}: { - services.hadoop.package = pkgs.hadoop_3_1; + services.hadoop.package = pkgs.hadoop; services.hadoop.yarn.nodemanager.enabled = true; services.hadoop.yarnSite = { "yarn.resourcemanager.hostname" = "resourcemanager"; diff --git a/pkgs/applications/networking/cluster/hadoop/default.nix b/pkgs/applications/networking/cluster/hadoop/default.nix index 589ce21fe87babb..0e8e652fbb30b8b 100644 --- a/pkgs/applications/networking/cluster/hadoop/default.nix +++ b/pkgs/applications/networking/cluster/hadoop/default.nix @@ -1,180 +1,96 @@ -{ lib, stdenv, fetchurl, makeWrapper, pkg-config, which, maven, cmake, jre, jdk8, bash -, coreutils, glibc, protobuf2_5, fuse, snappy, zlib, bzip2, openssl, openssl_1_0_2, fetchpatch, libtirpc +{ lib, stdenv, fetchurl, makeWrapper, autoPatchelfHook +, jdk8_headless, jdk11_headless +, bash, coreutils, which +, bzip2, cyrus_sasl , protobuf3_7, snappy, zlib, zstd +, openssl }: -let - maven-jdk8 = maven.override { - jdk = jdk8; - }; - common = { version, sha256, dependencies-sha256, maven, tomcat, opensslPkg ? 
openssl }: - let - # compile the hadoop tarball from sources, it requires some patches - binary-distributon = stdenv.mkDerivation rec { - name = "hadoop-${version}-bin"; - src = fetchurl { - url = "mirror://apache/hadoop/common/hadoop-${version}/hadoop-${version}-src.tar.gz"; - inherit sha256; - }; - - postUnpack = lib.optionalString (tomcat != null) '' - install -D ${tomcat.src} $sourceRoot/hadoop-hdfs-project/hadoop-hdfs-httpfs/downloads/apache-tomcat-${tomcat.version}.tar.gz - install -D ${tomcat.src} $sourceRoot/hadoop-common-project/hadoop-kms/downloads/apache-tomcat-${tomcat.version}.tar.gz - ''; - - # perform fake build to make a fixed-output derivation of dependencies downloaded from maven central (~100Mb in ~3000 files) - fetched-maven-deps = stdenv.mkDerivation { - name = "hadoop-${version}-maven-deps"; - inherit src postUnpack nativeBuildInputs buildInputs; - buildPhase = '' - while mvn package -Dmaven.repo.local=$out/.m2 ${mavenFlags} -Dmaven.wagon.rto=5000; [ $? = 1 ]; do - echo "timeout, restart maven to continue downloading" - done - ''; - # keep only *.{pom,jar,xml,sha1,so,dll,dylib} and delete all ephemeral files with lastModified timestamps inside - installPhase = ''find $out/.m2 -type f -regex '.+\(\.lastUpdated\|resolver-status\.properties\|_remote\.repositories\)' -delete''; - outputHashAlgo = "sha256"; - outputHashMode = "recursive"; - outputHash = dependencies-sha256; - }; +with lib; - nativeBuildInputs = [ maven cmake pkg-config ]; - buildInputs = [ fuse snappy zlib bzip2 opensslPkg protobuf2_5 libtirpc ]; - NIX_CFLAGS_COMPILE = [ "-I${libtirpc.dev}/include/tirpc" ]; - NIX_LDFLAGS = [ "-ltirpc" ]; - - # most of the hardcoded pathes are fixed in 2.9.x and 3.0.0, this list of patched files might be reduced when 2.7.x and 2.8.x will be deprecated - - patches = [ - (fetchpatch { - url = "https://patch-diff.githubusercontent.com/raw/apache/hadoop/pull/2886.patch"; - sha256 = "1fim1d8va050za5i8a6slphmx015fzvhxkc2wi4rwg7kbj31sv0r"; - }) - ]; - - 
postPatch = '' - for file in hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/HardLink.java \ - hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/Shell.java \ - hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java \ - hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DockerContainerExecutor.java \ - hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java \ - hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/MRJobConfig.java; do - if [ -f "$file" ]; then - substituteInPlace "$file" \ - --replace '/usr/bin/stat' 'stat' \ - --replace '/bin/bash' 'bash' \ - --replace '/bin/ls' 'ls' \ - --replace '/bin/mv' 'mv' - fi - done - ''; - dontConfigure = true; # do not trigger cmake hook - mavenFlags = "-Drequire.snappy -Drequire.bzip2 -DskipTests -Pdist,native -e"; - buildPhase = '' - # 'maven.repo.local' must be writable - mvn package --offline -Dmaven.repo.local=$(cp -dpR ${fetched-maven-deps}/.m2 ./ && chmod +w -R .m2 && pwd)/.m2 ${mavenFlags} - # remove runtime dependency on $jdk/jre/lib/amd64/server/libjvm.so - patchelf --set-rpath ${lib.makeLibraryPath [glibc]} hadoop-dist/target/hadoop-${version}/lib/native/libhadoop.so.1.0.0 - patchelf --set-rpath ${lib.makeLibraryPath [glibc]} hadoop-dist/target/hadoop-${version}/lib/native/libhdfs.so.0.0.0 - ''; - installPhase = "mv hadoop-dist/target/hadoop-${version} $out"; +let + common = { pname, version, untarDir ? "${pname}-${version}", sha256, jdk, openssl, nativeLibs ? [ ], libPatches ?
"" }: + stdenv.mkDerivation rec { + inherit pname version jdk libPatches untarDir openssl; + src = fetchurl { + url = "mirror://apache/hadoop/common/hadoop-${version}/hadoop-${version}.tar.gz"; + inherit sha256; }; - in - stdenv.mkDerivation { - pname = "hadoop"; - inherit version; - src = binary-distributon; + nativeBuildInputs = [ makeWrapper ] + ++ optionals (nativeLibs != [] || libPatches != "") [ autoPatchelfHook ]; + buildInputs = [ openssl ] ++ nativeLibs; - nativeBuildInputs = [ makeWrapper ]; + installPhase = '' + mkdir -p $out/{lib/${untarDir}/conf,bin,lib} + mv * $out/lib/${untarDir} - installPhase = '' - mkdir -p $out/share/doc/hadoop - cp -dpR * $out/ - mv $out/*.txt $out/share/doc/hadoop/ + for n in $(find $out/lib/${untarDir}/bin -type f ! -name "*.*"); do + makeWrapper "$n" "$out/bin/$(basename $n)"\ + --set-default JAVA_HOME ${jdk.home}\ + --set-default HADOOP_HOME $out/lib/${untarDir}\ + --set-default HADOOP_CONF_DIR /etc/hadoop-conf/\ + --prefix PATH : "${makeBinPath [ bash coreutils which]}"\ + --prefix JAVA_LIBRARY_PATH : "${makeLibraryPath buildInputs}" + done + '' + libPatches; - # - # Do not use `wrapProgram` here, script renaming may result to weird things: http://i.imgur.com/0Xee013.png - # - mkdir -p $out/bin.wrapped - for n in $out/bin/*; do - if [ -f "$n" ]; then # only regular files - mv $n $out/bin.wrapped/ - makeWrapper $out/bin.wrapped/$(basename $n) $n \ - --prefix PATH : "${lib.makeBinPath [ which jre bash coreutils ]}" \ - --prefix JAVA_LIBRARY_PATH : "${lib.makeLibraryPath [ opensslPkg snappy zlib bzip2 ]}" \ - --set JAVA_HOME "${jre}" \ - --set HADOOP_PREFIX "$out" - fi - done - ''; + meta = { + homepage = "https://hadoop.apache.org/"; + description = "Framework for distributed processing of large data sets across clusters of computers"; + license = licenses.asl20; - meta = with lib; { - homepage = "https://hadoop.apache.org/"; - description = "Framework for distributed processing of large data sets across clusters of 
computers"; - license = licenses.asl20; - - longDescription = '' - The Apache Hadoop software library is a framework that allows for - the distributed processing of large data sets across clusters of - computers using a simple programming model. It is designed to - scale up from single servers to thousands of machines, each - offering local computation and storage. Rather than rely on - hardware to deliver high-avaiability, the library itself is - designed to detect and handle failures at the application layer, - so delivering a highly-availabile service on top of a cluster of - computers, each of which may be prone to failures. - ''; - maintainers = with maintainers; [ volth ]; - platforms = [ "x86_64-linux" ]; - }; + longDescription = '' + The Apache Hadoop software library is a framework that allows for + the distributed processing of large data sets across clusters of + computers using a simple programming model. It is designed to + scale up from single servers to thousands of machines, each + offering local computation and storage. Rather than rely on + hardware to deliver high-availability, the library itself is + designed to detect and handle failures at the application layer, + so delivering a highly-available service on top of a cluster of + computers, each of which may be prone to failures. 
+ ''; + maintainers = with maintainers; [ volth illustris ]; + platforms = [ "x86_64-linux" ]; }; - tomcat_6_0_48 = rec { - version = "6.0.48"; - src = fetchurl { - # do not use "mirror://apache/" here, tomcat-6 is legacy and has been removed from the mirrors - url = "https://archive.apache.org/dist/tomcat/tomcat-6/v${version}/bin/apache-tomcat-${version}.tar.gz"; - sha256 = "1w4jf28g8p25fmijixw6b02iqlagy2rvr57y3n90hvz341kb0bbc"; }; +in +{ + # Different versions of Hadoop support different Java runtime versions + # https://cwiki.apache.org/confluence/display/HADOOP/Hadoop+Java+Versions + hadoop_3_3 = common rec { + pname = "hadoop"; + version = "3.3.1"; + sha256 = "1b3v16ihysqaxw8za1r5jlnphy8dwhivdx2d0z64309w57ihlxxd"; + untarDir = "${pname}-${version}"; + jdk = jdk11_headless; + inherit openssl; + # TODO: Package and add Intel Storage Acceleration Library + nativeLibs = [ stdenv.cc.cc.lib protobuf3_7 zlib snappy ]; + libPatches = '' + ln -s ${getLib cyrus_sasl}/lib/libsasl2.so $out/lib/${untarDir}/lib/native/libsasl2.so.2 + ln -s ${getLib openssl}/lib/libcrypto.so $out/lib/${untarDir}/lib/native/ + ln -s ${getLib zlib}/lib/libz.so.1 $out/lib/${untarDir}/lib/native/ + ln -s ${getLib zstd}/lib/libzstd.so.1 $out/lib/${untarDir}/lib/native/ + ln -s ${getLib bzip2}/lib/libbz2.so.1 $out/lib/${untarDir}/lib/native/ + patchelf --add-rpath ${jdk.home}/lib/server $out/lib/${untarDir}/lib/native/libnativetask.so.1.0.0 + ''; }; - -in { - hadoop_2_7 = common { - version = "2.7.7"; - sha256 = "1ahv67f3lwak3kbjvnk1gncq56z6dksbajj872iqd0awdsj3p5rf"; - dependencies-sha256 = "1lsr9nvrynzspxqcamb10d596zlnmnfpxhkd884gdiva0frm0b1r"; - tomcat = tomcat_6_0_48; - opensslPkg = openssl_1_0_2; - maven = maven-jdk8; - }; - hadoop_2_8 = common { - version = "2.8.4"; - sha256 = "16c3ljhrzibkjn3y1bmjxdgf0kn60l23ay5hqpp7vpbnqx52x68w"; - dependencies-sha256 = "1j4f461487fydgr5978nnm245ksv4xbvskfr8pbmfhcyss6b7w03"; - tomcat = tomcat_6_0_48; - opensslPkg = openssl_1_0_2; - maven = maven-jdk8; - }; - 
hadoop_2_9 = common { - version = "2.9.1"; - sha256 = "0qgmpfbpv7f521fkjy5ldzdb4lwiblhs0hyl8qy041ws17y5x7d7"; - dependencies-sha256 = "1d5i8jj5y746rrqb9lscycnd7acmxlkz64ydsiyqsh5cdqgy2x7x"; - tomcat = tomcat_6_0_48; - opensslPkg = openssl_1_0_2; - maven = maven-jdk8; - }; - hadoop_3_0 = common { - version = "3.0.3"; - sha256 = "1vvkci0kx4b48dg0niifn2d3r4wwq8pb3c5z20wy8pqsqrqhlci5"; - dependencies-sha256 = "1kzkna9ywacm2m1cirj9cyip66bgqjhid2xf9rrhq6g10lhr8j9m"; - tomcat = null; - maven = maven-jdk8; + hadoop_3_2 = common rec { + pname = "hadoop"; + version = "3.2.2"; + sha256 = "1hxq297cqvkfgz2yfdiwa3l28g44i2abv5921k2d6b4pqd33prwp"; + jdk = jdk8_headless; + # not using native libs because of broken openssl_1_0_2 dependency + # can be manually overridden + openssl = null; }; - hadoop_3_1 = common { - version = "3.1.1"; - sha256 = "04hhdbyd4x1hy0fpy537f8mi0864hww97zap29x7dk1smrffwabd"; - dependencies-sha256 = "1q63jsxg3d31x0p8hvhpvbly2b07almyzsbhwphbczl3fhlqgiwn"; - tomcat = null; - maven = maven-jdk8; + hadoop2 = common rec { + pname = "hadoop"; + version = "2.10.1"; + sha256 = "1w31x4bk9f2swnx8qxx0cgwfg8vbpm6cy5lvfnbbpl3rsjhmyg97"; + jdk = jdk8_headless; + openssl = null; }; } diff --git a/pkgs/top-level/all-packages.nix b/pkgs/top-level/all-packages.nix index a92efb79584df35..e0329b9862b2c9a 100644 --- a/pkgs/top-level/all-packages.nix +++ b/pkgs/top-level/all-packages.nix @@ -13102,15 +13102,12 @@ with pkgs; groovy = callPackage ../development/interpreters/groovy { }; - inherit (callPackages ../applications/networking/cluster/hadoop { - jre = jre8; # TODO: remove override https://github.com/NixOS/nixpkgs/pull/89731 - }) - hadoop_2_7 - hadoop_2_8 - hadoop_2_9 - hadoop_3_0 - hadoop_3_1; - hadoop = hadoop_2_7; + inherit (callPackages ../applications/networking/cluster/hadoop { }) + hadoop_3_3 + hadoop_3_2 + hadoop2; + hadoop3 = hadoop_3_3; + hadoop = hadoop3; io = callPackage ../development/interpreters/io { }; @@ -13473,7 +13470,7 @@ with pkgs; self = 
pkgsi686Linux.callPackage ../development/interpreters/self { }; - inherit (callPackages ../applications/networking/cluster/spark { hadoop = hadoop_3_1; }) + inherit (callPackages ../applications/networking/cluster/spark { }) spark3 spark2; spark = spark3;