diff --git a/docs/asciidoc/modules/ROOT/partials/usage/apoc.load.html.adoc b/docs/asciidoc/modules/ROOT/partials/usage/apoc.load.html.adoc index 522220f3b8..ec8d6f165e 100644 --- a/docs/asciidoc/modules/ROOT/partials/usage/apoc.load.html.adoc +++ b/docs/asciidoc/modules/ROOT/partials/usage/apoc.load.html.adoc @@ -224,4 +224,66 @@ a| ] } ---- -|=== \ No newline at end of file +|=== + +If we have a `.html` file with a jQuery script like: + +[source,html] +---- + + + + + + +
+ + +---- + +we can read the generated js through the `browser` config. +Note that to use a browser, you have to install <<selenium-depencencies,these additional dependencies>>: + +[source,cypher] +---- +CALL apoc.load.html("test.html",{strong: "strong"}, {browser: "FIREFOX"}); +---- +.Results +[opts="header"] +|=== +| Output +a| +[source,json] +---- +{ + "strong": [ + { + "tagName": "strong", + "text": "This is a new text node" + } + ] +} +---- +|=== + +If we need to parse a tag that is rendered by a slow async call, we can use the `wait` config to wait for it (up to 10 seconds in this example): + +[source,cypher] +---- +CALL apoc.load.html("test.html",{asyncTag: "#asyncTag"}, {browser: "FIREFOX", wait: 10}); +---- + +[[selenium-depencencies]] +== Dependencies + +To use the `apoc.load.html` procedure with the `browser` config set to a value other than `NONE`, you have to add additional dependencies. + +These dependencies are included in https://github.com/neo4j-contrib/neo4j-apoc-procedures/releases/download/{apoc-release}/apoc-selenium-dependencies-{apoc-release}.jar[apoc-selenium-dependencies-{apoc-release}.jar^], which can be downloaded from the https://github.com/neo4j-contrib/neo4j-apoc-procedures/releases/tag/{apoc-release}[releases page^]. +Once that file is downloaded, it should be placed in the `plugins` directory and the Neo4j Server restarted. 
\ No newline at end of file diff --git a/docs/asciidoc/modules/ROOT/partials/usage/config/apoc.load.html.adoc b/docs/asciidoc/modules/ROOT/partials/usage/config/apoc.load.html.adoc index 4aa1f818dd..8c760508f6 100644 --- a/docs/asciidoc/modules/ROOT/partials/usage/config/apoc.load.html.adoc +++ b/docs/asciidoc/modules/ROOT/partials/usage/config/apoc.load.html.adoc @@ -1,10 +1,19 @@ The procedure support the following config parameters: .Config parameters -[opts=header] +[opts="header",cols="1m,2m,1m,4"] |=== | name | type | default | description -| charset | String | "UTF-8" | the character set of the page being scraped +| browser | Enum [NONE, CHROME, FIREFOX] | NONE | If it is set to "CHROME" or "FIREFOX", the https://www.selenium.dev/documentation/en/webdriver/[Selenium Web Driver] is used to read the dynamically generated js. + In case it is "NONE" (default), it is not possible to read dynamic contents. + Note that to use the Chrome or Firefox driver, you need to have the browser installed on your machine and you have to download additional jars into the `plugins` folder. See <<selenium-depencencies>>. +| wait | long | 0 | If greater than 0, it waits until it finds at least one element for each of the selectors entered in the query parameter + (up to the given number of seconds, after which it continues execution anyway). + Useful to handle elements which can be rendered after the page is loaded (e.g. slow asynchronous calls). +| charset | String | "UTF-8" | the character set of the page being scraped, if `http-equiv` meta-tag is not set. +| headless | boolean | true | Valid with `browser` not equal to `NONE`, allows running the browser in https://chromium.googlesource.com/chromium/src/+/lkgr/headless/README.md[headless mode], + that is without actually opening the browser UI (recommended). 
+| acceptInsecureCerts | boolean | true | If true, allow to read html from insecure certificates | baseUri | String | "" | Base URI used to resolve relative paths | failSilently | Enum [FALSE, WITH_LOG, WITH_LIST] | FALSE | If the parse fails with one or more elements, using `FALSE` it throws a `RuntimeException`, using `WITH_LOG` a `log.warn` is created for each incorrect item and using `WITH_LIST` an `errorList` key is added to the result with the failed tags. |=== \ No newline at end of file diff --git a/extra-dependencies/selenium/build.gradle b/extra-dependencies/selenium/build.gradle new file mode 100644 index 0000000000..dbed4a904f --- /dev/null +++ b/extra-dependencies/selenium/build.gradle @@ -0,0 +1,25 @@ +plugins { + id 'com.github.johnrengelman.shadow' version '4.0.3' +} + +java { + sourceCompatibility = JavaVersion.VERSION_11 + targetCompatibility = JavaVersion.VERSION_11 +} + +archivesBaseName = 'apoc-selenium-dependencies' +description = """APOC Selenium Dependencies""" + +jar { + manifest { + attributes 'Implementation-Version': version + } +} + +dependencies { + // currently we cannot update to the latest version due to guava minimum version required (31.0.1-jre) + compile group: 'org.seleniumhq.selenium', name: 'selenium-java', version: '3.141.59', { + exclude group: 'com.google.guava', module: 'guava' + } + compile group: 'io.github.bonigarcia', name: 'webdrivermanager', version: '4.4.3' +} diff --git a/extra-dependencies/selenium/gradle/wrapper/gradle-wrapper.jar b/extra-dependencies/selenium/gradle/wrapper/gradle-wrapper.jar new file mode 100644 index 0000000000..e69de29bb2 diff --git a/extra-dependencies/selenium/gradle/wrapper/gradle-wrapper.properties b/extra-dependencies/selenium/gradle/wrapper/gradle-wrapper.properties new file mode 100644 index 0000000000..e54747b5c7 --- /dev/null +++ b/extra-dependencies/selenium/gradle/wrapper/gradle-wrapper.properties @@ -0,0 +1,6 @@ +#Tue Feb 06 14:27:44 CET 2018 +distributionBase=GRADLE_USER_HOME 
+distributionPath=wrapper/dists +zipStoreBase=GRADLE_USER_HOME +zipStorePath=wrapper/dists +distributionUrl=https\://services.gradle.org/distributions/gradle-5.2.1-bin.zip \ No newline at end of file diff --git a/extra-dependencies/selenium/gradlew b/extra-dependencies/selenium/gradlew new file mode 100755 index 0000000000..2fe81a7d95 --- /dev/null +++ b/extra-dependencies/selenium/gradlew @@ -0,0 +1,183 @@ +#!/usr/bin/env sh + +# +# Copyright 2015 the original author or authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +############################################################################## +## +## Gradle start up script for UN*X +## +############################################################################## + +# Attempt to set APP_HOME +# Resolve links: $0 may be a link +PRG="$0" +# Need this for relative symlinks. +while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`"/$link" + fi +done +SAVED="`pwd`" +cd "`dirname \"$PRG\"`/" >/dev/null +APP_HOME="`pwd -P`" +cd "$SAVED" >/dev/null + +APP_NAME="Gradle" +APP_BASE_NAME=`basename "$0"` + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' + +# Use the maximum available, or set MAX_FD != -1 to use that value. 
+MAX_FD="maximum" + +warn () { + echo "$*" +} + +die () { + echo + echo "$*" + echo + exit 1 +} + +# OS specific support (must be 'true' or 'false'). +cygwin=false +msys=false +darwin=false +nonstop=false +case "`uname`" in + CYGWIN* ) + cygwin=true + ;; + Darwin* ) + darwin=true + ;; + MINGW* ) + msys=true + ;; + NONSTOP* ) + nonstop=true + ;; +esac + +CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + +# Determine the Java command to use to start the JVM. +if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + else + JAVACMD="$JAVA_HOME/bin/java" + fi + if [ ! -x "$JAVACMD" ] ; then + die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +else + JAVACMD="java" + which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." +fi + +# Increase the maximum file descriptors if we can. +if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then + MAX_FD_LIMIT=`ulimit -H -n` + if [ $? -eq 0 ] ; then + if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then + MAX_FD="$MAX_FD_LIMIT" + fi + ulimit -n $MAX_FD + if [ $? 
-ne 0 ] ; then + warn "Could not set maximum file descriptor limit: $MAX_FD" + fi + else + warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" + fi +fi + +# For Darwin, add options to specify how the application appears in the dock +if $darwin; then + GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" +fi + +# For Cygwin or MSYS, switch paths to Windows format before running java +if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then + APP_HOME=`cygpath --path --mixed "$APP_HOME"` + CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` + JAVACMD=`cygpath --unix "$JAVACMD"` + + # We build the pattern for arguments to be converted via cygpath + ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` + SEP="" + for dir in $ROOTDIRSRAW ; do + ROOTDIRS="$ROOTDIRS$SEP$dir" + SEP="|" + done + OURCYGPATTERN="(^($ROOTDIRS))" + # Add a user-defined pattern to the cygpath arguments + if [ "$GRADLE_CYGPATTERN" != "" ] ; then + OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" + fi + # Now convert the arguments - kludge to limit ourselves to /bin/sh + i=0 + for arg in "$@" ; do + CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` + CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option + + if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition + eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` + else + eval `echo args$i`="\"$arg\"" + fi + i=`expr $i + 1` + done + case $i in + 0) set -- ;; + 1) set -- "$args0" ;; + 2) set -- "$args0" "$args1" ;; + 3) set -- "$args0" "$args1" "$args2" ;; + 4) set -- "$args0" "$args1" "$args2" "$args3" ;; + 5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; + 6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; + 7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; + 8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; + 9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" 
"$args5" "$args6" "$args7" "$args8" ;; + esac +fi + +# Escape application args +save () { + for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done + echo " " +} +APP_ARGS=`save "$@"` + +# Collect all arguments for the java command, following the shell quoting and substitution rules +eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" + +exec "$JAVACMD" "$@" diff --git a/extra-dependencies/selenium/gradlew.bat b/extra-dependencies/selenium/gradlew.bat new file mode 100644 index 0000000000..9618d8d960 --- /dev/null +++ b/extra-dependencies/selenium/gradlew.bat @@ -0,0 +1,100 @@ +@rem +@rem Copyright 2015 the original author or authors. +@rem +@rem Licensed under the Apache License, Version 2.0 (the "License"); +@rem you may not use this file except in compliance with the License. +@rem You may obtain a copy of the License at +@rem +@rem https://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, software +@rem distributed under the License is distributed on an "AS IS" BASIS, +@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@rem See the License for the specific language governing permissions and +@rem limitations under the License. +@rem + +@if "%DEBUG%" == "" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +set DIRNAME=%~dp0 +if "%DIRNAME%" == "" set DIRNAME=. +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
+set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if "%ERRORLEVEL%" == "0" goto init + +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto init + +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:init +@rem Get command-line arguments, handling Windows variants + +if not "%OS%" == "Windows_NT" goto win9xME_args + +:win9xME_args +@rem Slurp the command line arguments. +set CMD_LINE_ARGS= +set _SKIP=2 + +:win9xME_args_slurp +if "x%~1" == "x" goto execute + +set CMD_LINE_ARGS=%* + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% + +:end +@rem End local scope for the variables with windows NT shell +if "%ERRORLEVEL%"=="0" goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! 
+if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 +exit /b 1 + +:mainEnd +if "%OS%"=="Windows_NT" endlocal + +:omega diff --git a/extra-dependencies/selenium/settings.gradle b/extra-dependencies/selenium/settings.gradle new file mode 100644 index 0000000000..988ada53d1 --- /dev/null +++ b/extra-dependencies/selenium/settings.gradle @@ -0,0 +1,17 @@ + +pluginManagement { + repositories { +// mavenLocal() + maven { + url "https://neo.jfrog.io/neo/docs-maven" // System.env.ARTIFACTORY_URI +/* + credentials { + username System.env.ARTIFACTORY_USERNAME + password System.env.ARTIFACTORY_PASSWORD + } +*/ + } + gradlePluginPortal() + + } +} \ No newline at end of file diff --git a/extra-dependencies/settings.gradle b/extra-dependencies/settings.gradle index 48eea32187..db39a8b3a1 100644 --- a/extra-dependencies/settings.gradle +++ b/extra-dependencies/settings.gradle @@ -5,4 +5,5 @@ include('email') include('couchbase') include('xls') include('redis') +include('selenium') include('hadoop') \ No newline at end of file diff --git a/full/build.gradle b/full/build.gradle index 2f4c1185b1..6f82817b7b 100644 --- a/full/build.gradle +++ b/full/build.gradle @@ -66,6 +66,12 @@ dependencies { compile 'org.jsoup:jsoup:1.14.3' + // currently we cannot update selenium to the latest version due to guava minimum version required (31.0.1-jre) + compileOnly group: 'org.seleniumhq.selenium', name: 'selenium-java', version: '3.141.59', { exclude group: 'com.google.guava', module: 'guava' } + testCompile group: 'org.seleniumhq.selenium', name: 'selenium-java', version: '3.141.59', { exclude group: 'com.google.guava', module: 'guava' } + compileOnly group: 'io.github.bonigarcia', name: 'webdrivermanager', version: '5.1.0' + testCompile group: 'io.github.bonigarcia', name: 'webdrivermanager', version: '5.1.0' + compile group: 'org.roaringbitmap', name: 'RoaringBitmap', version: '0.7.17' compile(group: 'org.apache.commons', name: 'commons-configuration2', version: '2.7') { exclude group: "org.yaml" 
diff --git a/full/src/main/java/apoc/load/LoadHtml.java b/full/src/main/java/apoc/load/LoadHtml.java index e655fb9ad3..2a402d9417 100644 --- a/full/src/main/java/apoc/load/LoadHtml.java +++ b/full/src/main/java/apoc/load/LoadHtml.java @@ -2,6 +2,7 @@ import apoc.Extended; import apoc.result.MapResult; +import apoc.util.MissingDependencyException; import apoc.util.FileUtils; import apoc.util.Util; import org.jsoup.Jsoup; @@ -17,19 +18,25 @@ import org.neo4j.procedure.Procedure; import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; import java.io.UnsupportedEncodingException; - -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.function.Supplier; import java.util.stream.Stream; +import static apoc.load.LoadHtmlBrowser.getChromeInputStream; +import static apoc.load.LoadHtmlBrowser.getFirefoxInputStream; + @Extended public class LoadHtml { // public for test purpose public static final String KEY_ERROR = "errorList"; - private enum FailSilently { FALSE, WITH_LOG, WITH_LIST } - @Context public GraphDatabaseService db; @@ -40,16 +47,13 @@ private enum FailSilently { FALSE, WITH_LOG, WITH_LIST } @Procedure @Description("apoc.load.html('url',{name: jquery, name2: jquery}, config) YIELD value - Load Html page and return the result as a Map") public Stream html(@Name("url") String url, @Name(value = "query",defaultValue = "{}") Map query, @Name(value = "config",defaultValue = "{}") Map config) { - return readHtmlPage(url, query, config); + return readHtmlPage(url, query, new LoadHtmlConfig(config)); } - private Stream readHtmlPage(String url, Map query, Map config) { - String charset = config.getOrDefault("charset", "UTF-8").toString(); + private Stream readHtmlPage(String url, Map query, LoadHtmlConfig config) { try { // baseUri is used to resolve relative paths - String baseUri = config.getOrDefault("baseUri", "").toString(); - - Document 
document = Jsoup.parse(FileUtils.inputStreamFor(url, null, null, null), charset, baseUri); + Document document = Jsoup.parse(getHtmlInputStream(url, query, config), config.getCharset(), config.getBaseUri()); Map output = new HashMap<>(); List errorList = new ArrayList<>(); @@ -62,19 +66,34 @@ private Stream readHtmlPage(String url, Map query, Ma output.put(KEY_ERROR, errorList); } - return Stream.of(new MapResult(output) ); + return Stream.of(new MapResult(output)); + } catch (IllegalArgumentException | ClassCastException e) { + throw new RuntimeException("Invalid config: " + config); } catch (FileNotFoundException e) { throw new RuntimeException("File not found from: " + url); } catch(UnsupportedEncodingException e) { - throw new RuntimeException("Unsupported charset: " + charset); + throw new RuntimeException("Unsupported charset: " + config.getCharset()); } catch(Exception e) { throw new RuntimeException("Can't read the HTML from: "+ url, e); } } + + private InputStream getHtmlInputStream(String url, Map query, LoadHtmlConfig config) throws IOException { + + final boolean isHeadless = config.isHeadless(); + final boolean isAcceptInsecureCerts = config.isAcceptInsecureCerts(); + switch (config.getBrowser()) { + case FIREFOX: + return withSeleniumBrowser(() -> getFirefoxInputStream(url, query, config, isHeadless, isAcceptInsecureCerts)); + case CHROME: + return withSeleniumBrowser(() -> getChromeInputStream(url, query, config, isHeadless, isAcceptInsecureCerts)); + default: + return FileUtils.inputStreamFor(url, null, null, null); + } + } - private List> getElements(Elements elements, Map config, List errorList) { + private List> getElements(Elements elements, LoadHtmlConfig conf, List errorList) { - FailSilently failConfig = FailSilently.valueOf((String) config.getOrDefault("failSilently", "FALSE")); List> elementList = new ArrayList<>(); for (Element element : elements) { @@ -85,10 +104,10 @@ private List> getElements(Elements elements, Map> getElements(Elements 
elements, Map getAttributes(Element element) { throw new RuntimeException("Invalid tag " + element); } if (!attribute.getValue().isBlank()) { - attributes.put(attribute.getKey(), attribute.getValue()); + final String key = attribute.getKey(); + // with href/src attribute we prepend baseUri path + final boolean attributeHasLink = key.equals("href") || key.equals("src"); + attributes.put(key, attributeHasLink ? element.absUrl(key) : attribute.getValue()); } } return attributes; } + private InputStream withSeleniumBrowser(Supplier action) { + try { + return action.get(); + } catch (NoClassDefFoundError e) { + throw new MissingDependencyException("Cannot find jars into the plugins folder.\n" + + "See the documentation: https://neo4j.com/labs/apoc/4.1/overview/apoc.load/apoc.load.html/#selenium-depencencies"); + } + } } \ No newline at end of file diff --git a/full/src/main/java/apoc/load/LoadHtmlBrowser.java b/full/src/main/java/apoc/load/LoadHtmlBrowser.java new file mode 100644 index 0000000000..6e3b7bbe7c --- /dev/null +++ b/full/src/main/java/apoc/load/LoadHtmlBrowser.java @@ -0,0 +1,53 @@ +package apoc.load; + +import io.github.bonigarcia.wdm.WebDriverManager; +import org.apache.commons.io.IOUtils; +import org.openqa.selenium.By; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.chrome.ChromeOptions; +import org.openqa.selenium.firefox.FirefoxDriver; +import org.openqa.selenium.firefox.FirefoxOptions; +import org.openqa.selenium.support.ui.Wait; +import org.openqa.selenium.support.ui.WebDriverWait; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Map; + +public class LoadHtmlBrowser { + + public static InputStream getChromeInputStream(String url, Map query, LoadHtmlConfig config, boolean isHeadless, boolean isAcceptInsecureCerts) { + WebDriverManager.chromedriver().setup(); + ChromeOptions chromeOptions = new ChromeOptions(); + chromeOptions.setHeadless(isHeadless); + 
chromeOptions.setAcceptInsecureCerts(isAcceptInsecureCerts); + return getInputStreamWithBrowser(url, query, config, new ChromeDriver(chromeOptions)); + } + + public static InputStream getFirefoxInputStream(String url, Map query, LoadHtmlConfig config, boolean isHeadless, boolean isAcceptInsecureCerts) { + WebDriverManager.firefoxdriver().setup(); + FirefoxOptions firefoxOptions = new FirefoxOptions(); + firefoxOptions.setHeadless(isHeadless); + firefoxOptions.setAcceptInsecureCerts(isAcceptInsecureCerts); + return getInputStreamWithBrowser(url, query, config, new FirefoxDriver(firefoxOptions)); + } + + private static InputStream getInputStreamWithBrowser(String url, Map query, LoadHtmlConfig config, WebDriver driver) { + driver.get(url); + + final long wait = config.getWait(); + if (wait > 0) { + Wait driverWait = new WebDriverWait(driver, wait); + try { + driverWait.until(webDriver -> query.values().stream() + .noneMatch(selector -> webDriver.findElements(By.cssSelector(selector)).isEmpty())); + } catch (org.openqa.selenium.TimeoutException ignored) { + // We continue the execution even if 1 or more elements were not found + } + } + InputStream stream = IOUtils.toInputStream(driver.getPageSource(), config.getCharset()); + driver.close(); + return stream; + } +} diff --git a/full/src/main/java/apoc/load/LoadHtmlConfig.java b/full/src/main/java/apoc/load/LoadHtmlConfig.java new file mode 100644 index 0000000000..1f3e66d94a --- /dev/null +++ b/full/src/main/java/apoc/load/LoadHtmlConfig.java @@ -0,0 +1,67 @@ +package apoc.load; + +import apoc.util.Util; + +import java.util.Collections; +import java.util.Map; + +public class LoadHtmlConfig { + enum Browser { NONE, CHROME, FIREFOX } + enum FailSilently { FALSE, WITH_LOG, WITH_LIST } + + private final boolean headless; + private final boolean acceptInsecureCerts; + private final boolean children; + + private final String charset; + private final String baseUri; + + private final Browser browser; + private final 
FailSilently failSilently; + + private final long wait; + + public LoadHtmlConfig(Map config) { + if (config == null) config = Collections.emptyMap(); + this.headless = Util.toBoolean(config.getOrDefault("headless", true)); + this.acceptInsecureCerts = Util.toBoolean(config.getOrDefault("acceptInsecureCerts", true)); + this.children = Util.toBoolean(config.getOrDefault("children", false)); + this.charset = (String) config.getOrDefault("charset", "UTF-8"); + this.baseUri = (String) config.getOrDefault("baseUri", ""); + this.browser = Browser.valueOf((String) config.getOrDefault("browser", Browser.NONE.toString())); + this.failSilently = FailSilently.valueOf((String) config.getOrDefault("failSilently", FailSilently.FALSE.toString())); + this.wait = Util.toLong(config.getOrDefault("wait", 0)); + } + + public boolean isHeadless() { + return headless; + } + + public boolean isAcceptInsecureCerts() { + return acceptInsecureCerts; + } + + public boolean isChildren() { + return children; + } + + public String getCharset() { + return charset; + } + + public String getBaseUri() { + return baseUri; + } + + public Browser getBrowser() { + return browser; + } + + public FailSilently getFailSilently() { + return failSilently; + } + + public long getWait() { + return wait; + } +} diff --git a/full/src/test/java/apoc/load/LoadHtmlTest.java b/full/src/test/java/apoc/load/LoadHtmlTest.java index 6285ecf81e..a0b78b0168 100644 --- a/full/src/test/java/apoc/load/LoadHtmlTest.java +++ b/full/src/test/java/apoc/load/LoadHtmlTest.java @@ -14,6 +14,8 @@ import java.util.Collections; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; import static apoc.load.LoadHtml.KEY_ERROR; import static apoc.util.MapUtil.map; @@ -27,7 +29,7 @@ public class LoadHtmlTest { - private static final String RESULT_QUERY_METADATA = ("{attributes={charset=UTF-8}, tagName=meta}, " + + protected static final String RESULT_QUERY_METADATA = 
("{attributes={charset=UTF-8}, tagName=meta}, " + "{attributes={name=ResourceLoaderDynamicStyles}, tagName=meta}, " + "{attributes={name=generator, content=MediaWiki 1.32.0-wmf.18}, tagName=meta}, " + "{attributes={name=referrer, content=origin}, tagName=meta}, " + @@ -35,7 +37,7 @@ public class LoadHtmlTest { "{attributes={name=referrer, content=origin-when-cross-origin}, tagName=meta}, " + "{attributes={property=og:image, content=https://upload.wikimedia.org/wikipedia/en/e/ea/Aap_Kaa_Hak_titles.jpg}, tagName=meta}"); - private static final String RESULT_QUERY_H2 = ("{text=Contents, tagName=h2}, " + + protected static final String RESULT_QUERY_H2 = ("{text=Contents, tagName=h2}, " + "{text=Origins[edit], tagName=h2}, " + "{text=Content[edit], tagName=h2}, " + "{text=Legacy[edit], tagName=h2}, " + @@ -43,7 +45,6 @@ public class LoadHtmlTest { "{text=Navigation menu, tagName=h2}"); private static final String INVALID_PATH = new File("src/test/resources/wikipedia1.html").getName(); - private static final String INVALID_PATH_ABSOLUTE = new File("src/test/resources/wikipedia1.html").getName(); private static final String VALID_PATH = new File("src/test/resources/wikipedia.html").toURI().toString(); private static final String INVALID_CHARSET = "notValid"; @@ -57,20 +58,56 @@ public void setup() { } @Test - public void testQueryAll(){ - Map query = map("metadata", "meta", "h2", "h2"); + public void testParseGeneratedJs() { + testCallGeneratedJsWithBrowser("FIREFOX"); + testCallGeneratedJsWithBrowser("CHROME"); + } - testResult(db, "CALL apoc.load.html($url,$query, $config)", map("url",new File("src/test/resources/wikipedia.html").toURI().toString(), "query", query, "config", Collections.emptyMap()), + @Test + public void testWithWaitUntilAndOneElementNotFound() { + testCall(db, "CALL apoc.load.html($url,$query,$config)", + map("url",new File("src/test/resources/html/wikipediaWithJs.html").toURI().toString(), + "query", map("elementExistent", "strong", 
"elementNotExistent", ".asdfgh"), + "config", map("browser", "CHROME", "wait", 5)), result -> { - Map row = result.next(); - Map value = (Map) row.get("value"); + Map value = (Map) result.get("value"); + List> notExistent = (List>) value.get("elementNotExistent"); + List> existent = (List>) value.get("elementExistent"); + assertTrue(notExistent.isEmpty()); + assertEquals(1, existent.size()); + final Map tag = existent.get(0); + assertEquals("This is a new text node", tag.get("text")); + assertEquals("strong", tag.get("tagName")); + }); + } - List> metadata = (List>) value.get("metadata"); - List> h2 = (List>) value.get("h2"); + @Test + public void testWithBaseUriConfig() { + Map query = map("urlTest", ".urlTest"); + + final String baseUri = new File("src/test/resources").toURI().toString(); + testCall(db, "CALL apoc.load.html($url,$query, $config)", + map("url", new File("src/test/resources/html/wikipediaWithJs.html").toURI().toString(), + "query", query, + "config", map("baseUri", baseUri)), + result -> { + Map value = (Map) result.get("value"); + final List> urlTestList = (List>) value.get("urlTest"); + Map absoluteUrlTag = map("tagName", "a", "text", "absoluteUrl", + "attributes", map("href", "https://foundation.wikimedia.org/wiki/Privacy_policy", "class", "urlTest")); - assertEquals(asList(RESULT_QUERY_METADATA).toString().trim(), metadata.toString().trim()); - assertEquals(asList(RESULT_QUERY_H2).toString().trim(), h2.toString().trim()); - }); + Map urlSameUrlTag = map("tagName", "a", "text", "urlSamePath", + "attributes", map("href", baseUri + "this.js", "class", "urlTest")); + + Map forwardUrlTag = map("tagName", "a", "text", "forwardUrl", + "attributes", map("href", "file:/test.js", "class", "urlTest")); + + Map backUrlTag = map("tagName", "a", "text", "backUrl", + "attributes", map("href", baseUri.replace("test/resources/", "backUrl.js"), "class", "urlTest")); + + final Set> expectedSetList = Set.of(absoluteUrlTag, urlSameUrlTag, forwardUrlTag, 
backUrlTag); + assertEquals(expectedSetList, Set.copyOf(urlTestList)); + }); } @Test @@ -97,45 +134,6 @@ public void testQueryH2(){ }); } - @Test - public void testQueryH2WithConfig(){ - Map query = map("h2", "h2"); - Map config = map("charset", "UTF-8", "baserUri", ""); - - testResult(db, "CALL apoc.load.html($url,$query, $config)", map("url",new File("src/test/resources/wikipedia.html").toURI().toString(), "query", query, "config", config), - result -> { - Map row = result.next(); - assertEquals(map("h2",asList(RESULT_QUERY_H2)).toString().trim(), row.get("value").toString().trim()); - assertFalse(result.hasNext()); - }); - } - - @Test - public void testQueryWithChildren() { - Map query = map("toc", ".toc ul"); - Map config = map("children", true); - - testResult(db, "CALL apoc.load.html($url,$query, $config)", map("url",new File("src/test/resources/wikipedia.html").toURI().toString(), "query", query, "config", config), - result -> { - Map row = result.next(); - Map value = (Map) row.get("value"); - - List> toc = (List) value.get("toc"); - Map first = toc.get(0); - - // Should be
    - assertEquals("ul", first.get("tagName")); - - // Should have four children - assertEquals(4, ((List) first.get("children")).size()); - - Map firstChild = (Map)((List) first.get("children")).get(0); - - assertEquals("li", firstChild.get("tagName")); - assertEquals(1, ((List) firstChild.get("children")).size()); - }); - } - @Test public void testQueryWithFailsSilentlyWithLog() { Map query = map("a", "a", "invalid", "invalid", "h6", "h6"); @@ -214,12 +212,12 @@ public void testQueryWithExceptionIfIncorrectUrl() { @Test(expected = QueryExecutionException.class) public void testQueryWithFailsSilentlyWithLogWithExceptionIfIncorrectUrl() { - testIncorrectUrl("CALL apoc.load.html('" + INVALID_PATH + "',{failSilently: 'WITH_LOG', a:'a'})"); + testIncorrectUrl("CALL apoc.load.html('" + INVALID_PATH + "',{a:'a'}, {failSilently: 'WITH_LOG'})"); } @Test(expected = QueryExecutionException.class) public void testQueryWithFailsSilentlyWithListWithExceptionIfIncorrectUrl() { - testIncorrectUrl("CALL apoc.load.html('" + INVALID_PATH + "',{failSilently: 'WITH_LIST', a:'a'})"); + testIncorrectUrl("CALL apoc.load.html('" + INVALID_PATH + "',{a:'a'}, {failSilently: 'WITH_LIST'})"); } @Test(expected = QueryExecutionException.class) @@ -237,6 +235,20 @@ public void testQueryWithFailsSilentlyWithListWithExceptionIfIncorrectCharset() testIncorrectCharset("CALL apoc.load.html('" + VALID_PATH + "',{a:'a'}, {failSilently: 'WITH_LIST', charset: '" + INVALID_CHARSET + "'})"); } + @Test(expected = QueryExecutionException.class) + public void testFailsWithIncorrectBrowser() { + final String invalidValue = "NOT_VALID"; + final Map config = Map.of("browser", invalidValue); + try { + testCall(db, "CALL apoc.load.html('" + VALID_PATH + "',{a:'a'}, $config)", Map.of("config", config), (r) -> {}); + } catch (Exception e) { + Throwable except = ExceptionUtils.getRootCause(e); + String expectedMessage = "No enum constant " + LoadHtmlConfig.Browser.class.getCanonicalName() + "." 
+ invalidValue; + assertEquals(expectedMessage, except.getMessage()); + throw e; + } + } + private void testIncorrectCharset(String query) { try { testCall(db, query, (r) -> {}); @@ -259,4 +271,27 @@ private void testIncorrectUrl(String query) { throw e; } } + + private void testCallGeneratedJsWithBrowser(String browser) { + testCall(db, "CALL apoc.load.html($url,$query,$config)", + map("url",new File("src/test/resources/html/wikipediaWithJs.html").toURI().toString(), + "query", map("td", "td", "strong", "strong"), + "config", map("browser", browser)), + result -> { + Map value = (Map) result.get("value"); + List> tdList = (List>) value.get("td"); + List> strongList = (List>) value.get("strong"); + assertEquals(4, tdList.size()); + final String templateString = "foo bar - baz"; + AtomicInteger integer = new AtomicInteger(); + tdList.forEach(tag -> { + assertEquals("td", tag.get("tagName")); + assertEquals(integer.getAndIncrement() + templateString, tag.get("text")); + }); + assertEquals(1, strongList.size()); + final Map tagStrong = strongList.get(0); + assertEquals("This is a new text node", tagStrong.get("text")); + assertEquals("strong", tagStrong.get("tagName")); + }); + } } diff --git a/full/src/test/java/apoc/load/LoadHtmlTestParameterized.java b/full/src/test/java/apoc/load/LoadHtmlTestParameterized.java new file mode 100644 index 0000000000..7c3bb92c77 --- /dev/null +++ b/full/src/test/java/apoc/load/LoadHtmlTestParameterized.java @@ -0,0 +1,127 @@ +package apoc.load; + +import apoc.ApocSettings; +import apoc.util.TestUtil; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameter; +import org.junit.runners.Parameterized.Parameters; +import org.neo4j.test.rule.DbmsRule; +import org.neo4j.test.rule.ImpermanentDbmsRule; + +import java.io.File; +import java.util.Collection; +import java.util.List; +import java.util.Map; + 
+import static apoc.load.LoadHtmlTest.RESULT_QUERY_H2; +import static apoc.load.LoadHtmlTest.RESULT_QUERY_METADATA; +import static apoc.util.MapUtil.map; +import static apoc.util.TestUtil.testResult; +import static com.google.common.collect.Lists.newArrayList; +import static java.util.Arrays.asList; +import static java.util.Collections.emptyMap; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; + +@RunWith(Parameterized.class) +public class LoadHtmlTestParameterized { + // Tests taken from LoadHtmlTest.java. + // To check that `browser` configuration preserve the result. + + @Rule + public DbmsRule db = new ImpermanentDbmsRule() + .withSetting(ApocSettings.apoc_import_file_enabled, true); + + @Before + public void setup() { + TestUtil.registerProcedure(db, LoadHtml.class); + } + + + @Parameters + public static Collection data() { + return List.of("notSet", "NONE", "CHROME", "FIREFOX"); + } + + @Parameter + public String browser; + + + @Test + public void testQueryAll() { + Map query = map("metadata", "meta", "h2", "h2"); + + Map config = browserSet() ? 
Map.of("browser", browser) : emptyMap(); + testResult(db, "CALL apoc.load.html($url,$query, $config)", + map("url",new File("src/test/resources/wikipedia.html").toURI().toString(), "query", query, "config", config), + result -> { + Map row = result.next(); + Map value = (Map) row.get("value"); + + List> metadata = (List>) value.get("metadata"); + List> h2 = (List>) value.get("h2"); + + assertEquals(asList(RESULT_QUERY_METADATA).toString().trim(), metadata.toString().trim()); + assertEquals(asList(RESULT_QUERY_H2).toString().trim(), h2.toString().trim()); + }); + } + + @Test + public void testQueryH2WithConfig() { + Map query = map("h2", "h2"); + final List confList = newArrayList("charset", "UTF-8", "baseUri", ""); + addBrowserIfSet(confList); + Map config = map(confList.toArray()); + + testResult(db, "CALL apoc.load.html($url, $query, $config)", + map("url",new File("src/test/resources/wikipedia.html").toURI().toString(), "query", query, "config", config), + result -> { + Map row = result.next(); + assertEquals(map("h2",asList(RESULT_QUERY_H2)).toString().trim(), row.get("value").toString().trim()); + assertFalse(result.hasNext()); + }); + } + + @Test + public void testQueryWithChildren() { + Map query = map("toc", ".toc ul"); + final List confList = newArrayList("children", true); + addBrowserIfSet(confList); + Map config = map(confList.toArray()); + + testResult(db, "CALL apoc.load.html($url, $query, $config)", + map("url",new File("src/test/resources/wikipedia.html").toURI().toString(), "query", query, "config", config), + result -> { + Map row = result.next(); + Map value = (Map) row.get("value"); + + List> toc = (List) value.get("toc"); + Map first = toc.get(0); + + // Should be
      + assertEquals("ul", first.get("tagName")); + + // Should have four children + assertEquals(4, ((List) first.get("children")).size()); + + Map firstChild = (Map)((List) first.get("children")).get(0); + + assertEquals("li", firstChild.get("tagName")); + assertEquals(1, ((List) firstChild.get("children")).size()); + }); + } + + private void addBrowserIfSet(List confList) { + if (browserSet()) { + confList.addAll(List.of("browser", browser)); + } + } + + private boolean browserSet() { + return !browser.equals("notSet"); + } +} diff --git a/full/src/test/resources/html/wikipediaWithJs.html b/full/src/test/resources/html/wikipediaWithJs.html new file mode 100644 index 0000000000..503d57e7c3 --- /dev/null +++ b/full/src/test/resources/html/wikipediaWithJs.html @@ -0,0 +1,37 @@ + + + + + + + + + + +Aap Kaa Hak - Wikipedia + + +
      +
      + + + + + +
      col1col2
      +
      + + + diff --git a/full/src/test/resources/loadData.js b/full/src/test/resources/loadData.js new file mode 100644 index 0000000000..e291a3e176 --- /dev/null +++ b/full/src/test/resources/loadData.js @@ -0,0 +1,17 @@ +// append rows and cols to table.data in page.html +function loadData() { + data = document.getElementById("data"); + let index = 0; + for (let row = 0; row < 2; row++) { + let tr = document.createElement("tr"); + for (let col = 0; col < 2; col++) { + td = document.createElement("td"); + td.appendChild(document.createTextNode(index + `foo + bar - + baz`)); + tr.appendChild(td); + index++; + } + data.appendChild(tr); + } +} diff --git a/full/src/test/resources/wikipediaWithJs.html b/full/src/test/resources/wikipediaWithJs.html new file mode 100644 index 0000000000..93d096bac8 --- /dev/null +++ b/full/src/test/resources/wikipediaWithJs.html @@ -0,0 +1,46 @@ + + + + + + + +Aap Kaa Hak - Wikipedia + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      + + + + + +
      col1col2
      +
      + +