From 4a703af7d3194040bd0eaef01c0ed72a9914ee03 Mon Sep 17 00:00:00 2001 From: Andrea Santurbano Date: Wed, 16 Jun 2021 12:34:54 +0200 Subject: [PATCH] Revert "Fixes #1372: apoc.load.html ability to read runtime structure of the page" (#2012) This reverts commit 530a58c63be0ad7ec8bb3356425269e79d65ec21. --- .../ROOT/partials/usage/apoc.load.html.adoc | 64 +----- .../partials/usage/config/apoc.load.html.adoc | 13 +- extra-dependencies/selenium/build.gradle | 22 --- .../gradle/wrapper/gradle-wrapper.jar | 0 .../gradle/wrapper/gradle-wrapper.properties | 6 - extra-dependencies/selenium/gradlew | 183 ------------------ extra-dependencies/selenium/gradlew.bat | 100 ---------- extra-dependencies/selenium/settings.gradle | 17 -- extra-dependencies/settings.gradle | 3 +- full/build.gradle | 5 - full/src/main/java/apoc/load/LoadHtml.java | 68 ++----- .../main/java/apoc/load/LoadHtmlBrowser.java | 58 ------ .../main/java/apoc/load/LoadHtmlConfig.java | 67 ------- .../src/test/java/apoc/load/LoadHtmlTest.java | 149 ++++++-------- .../apoc/load/LoadHtmlTestParameterized.java | 125 ------------ .../test/resources/html/wikipediaWithJs.html | 37 ---- full/src/test/resources/loadData.js | 17 -- full/src/test/resources/wikipedia.html | 2 +- full/src/test/resources/wikipediaWithJs.html | 46 ----- 19 files changed, 79 insertions(+), 903 deletions(-) delete mode 100644 extra-dependencies/selenium/build.gradle delete mode 100644 extra-dependencies/selenium/gradle/wrapper/gradle-wrapper.jar delete mode 100644 extra-dependencies/selenium/gradle/wrapper/gradle-wrapper.properties delete mode 100755 extra-dependencies/selenium/gradlew delete mode 100644 extra-dependencies/selenium/gradlew.bat delete mode 100644 extra-dependencies/selenium/settings.gradle delete mode 100644 full/src/main/java/apoc/load/LoadHtmlBrowser.java delete mode 100644 full/src/main/java/apoc/load/LoadHtmlConfig.java delete mode 100644 full/src/test/java/apoc/load/LoadHtmlTestParameterized.java delete mode 100644 full/src/test/resources/html/wikipediaWithJs.html delete mode 100644 full/src/test/resources/loadData.js delete mode 100644 full/src/test/resources/wikipediaWithJs.html diff --git a/docs/asciidoc/modules/ROOT/partials/usage/apoc.load.html.adoc b/docs/asciidoc/modules/ROOT/partials/usage/apoc.load.html.adoc index ec8d6f165e..522220f3b8 100644 --- a/docs/asciidoc/modules/ROOT/partials/usage/apoc.load.html.adoc +++ b/docs/asciidoc/modules/ROOT/partials/usage/apoc.load.html.adoc @@ -224,66 +224,4 @@ a| ] } ---- -|=== - -If we have a `.html` file with a jQuery script like: - -[source,html] ----- - - - - - - -
- - ----- - -we can read the generated js through the `browser` config. -Note that to use a browser, you have to install <>: - -[source,cypher] ----- -CALL apoc.load.html("test.html",{strong: "strong"}, {browser: "FIREFOX"}); ----- -.Results -[opts="header"] -|=== -| Output -a| -[source,json] ----- -{ - "strong": [ - { - "tagName": "strong", - "text": "This is a new text node" - } - ] -} ----- -|=== - -If we can parse a tag from a slow async call, we can use `wait` config to waiting for 10 second (in this example): - -[source,cypher] ----- -CALL apoc.load.html("test.html",{asyncTag: "#asyncTag"}, {browser: "FIREFOX", wait: 10}); ----- - -[[selenium-depencencies]] -== Dependencies - -To use the `apoc.load.html` proceduree with `browser` config (not `NONE`), you have to add additional dependencies. - -This dependency is included in https://github.com/neo4j-contrib/neo4j-apoc-procedures/releases/download/{apoc-release}/apoc-selenium-dependencies-{apoc-release}.jar[apoc-selenium-dependencies-{apoc-release}.jar^], which can be downloaded from the https://github.com/neo4j-contrib/neo4j-apoc-procedures/releases/tag/{apoc-release}[releases page^]. -Once that file is downloaded, it should be placed in the `plugins` directory and the Neo4j Server restarted. \ No newline at end of file +|=== \ No newline at end of file diff --git a/docs/asciidoc/modules/ROOT/partials/usage/config/apoc.load.html.adoc b/docs/asciidoc/modules/ROOT/partials/usage/config/apoc.load.html.adoc index 8c760508f6..4aa1f818dd 100644 --- a/docs/asciidoc/modules/ROOT/partials/usage/config/apoc.load.html.adoc +++ b/docs/asciidoc/modules/ROOT/partials/usage/config/apoc.load.html.adoc @@ -1,19 +1,10 @@ The procedure support the following config parameters: .Config parameters -[opts="header",cols="1m,2m,1m,4"] +[opts=header] |=== | name | type | default | description -| browser | Enum [NONE, CHROME, FIREFOX] | NONE | If it is set to "CHROME" or "FIREFOX", is used https://www.selenium.dev/documentation/en/webdriver/[Selenium Web Driver] to read the dynamically generated js. - In case it is "NONE" (default), it is not possible to read dynamic contents. - Note that to use the Chrome or Firefox driver, you need to have them installed on your machine and you have to download additional jars into the plugin folder. <> -| wait | long | 0 | If greater than 0, it waits until it finds at least one element for each of those entered in the query parameter - (up to a maximum of defined seconds, otherwise it continues execution). - Useful to handle elements which can be rendered after the page is loaded (i.e. slow asynchronous calls). -| charset | String | "UTF-8" | the character set of the page being scraped, if `http-equiv` meta-tag is not set. -| headless | boolean | true | Valid with `browser` not equal to `NONE`, allow to run browser in https://chromium.googlesource.com/chromium/src/+/lkgr/headless/README.md[headless mode], - that is without actually opening the browser UI (recommended). -| acceptInsecureCerts | boolean | true | If true, allow to read html from insecure certificates +| charset | String | "UTF-8" | the character set of the page being scraped | baseUri | String | "" | Base URI used to resolve relative paths | failSilently | Enum [FALSE, WITH_LOG, WITH_LIST] | FALSE | If the parse fails with one or more elements, using `FALSE` it throws a `RuntimeException`, using `WITH_LOG` a `log.warn` is created for each incorrect item and using `WITH_LIST` an `errorList` key is added to the result with the failed tags. |=== \ No newline at end of file diff --git a/extra-dependencies/selenium/build.gradle b/extra-dependencies/selenium/build.gradle deleted file mode 100644 index 99661e4548..0000000000 --- a/extra-dependencies/selenium/build.gradle +++ /dev/null @@ -1,22 +0,0 @@ -plugins { - id 'com.github.johnrengelman.shadow' version '4.0.3' -} - -java { - sourceCompatibility = JavaVersion.VERSION_11 - targetCompatibility = JavaVersion.VERSION_11 -} - -archivesBaseName = 'apoc-selenium-dependencies' -description = """APOC Selenium Dependencies""" - -jar { - manifest { - attributes 'Implementation-Version': version - } -} - -dependencies { - compile group: 'org.seleniumhq.selenium', name: 'selenium-java', version: '3.141.59' - compile group: 'io.github.bonigarcia', name: 'webdrivermanager', version: '4.4.3' -} diff --git a/extra-dependencies/selenium/gradle/wrapper/gradle-wrapper.jar b/extra-dependencies/selenium/gradle/wrapper/gradle-wrapper.jar deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/extra-dependencies/selenium/gradle/wrapper/gradle-wrapper.properties b/extra-dependencies/selenium/gradle/wrapper/gradle-wrapper.properties deleted file mode 100644 index e54747b5c7..0000000000 --- a/extra-dependencies/selenium/gradle/wrapper/gradle-wrapper.properties +++ /dev/null @@ -1,6 +0,0 @@ -#Tue Feb 06 14:27:44 CET 2018 -distributionBase=GRADLE_USER_HOME -distributionPath=wrapper/dists -zipStoreBase=GRADLE_USER_HOME -zipStorePath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-5.2.1-bin.zip \ No newline at end of file diff --git a/extra-dependencies/selenium/gradlew b/extra-dependencies/selenium/gradlew deleted file mode 100755 index 2fe81a7d95..0000000000 --- a/extra-dependencies/selenium/gradlew +++ /dev/null @@ -1,183 +0,0 @@ -#!/usr/bin/env sh - -# -# Copyright 2015 the original author or authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -############################################################################## -## -## Gradle start up script for UN*X -## -############################################################################## - -# Attempt to set APP_HOME -# Resolve links: $0 may be a link -PRG="$0" -# Need this for relative symlinks. -while [ -h "$PRG" ] ; do - ls=`ls -ld "$PRG"` - link=`expr "$ls" : '.*-> \(.*\)$'` - if expr "$link" : '/.*' > /dev/null; then - PRG="$link" - else - PRG=`dirname "$PRG"`"/$link" - fi -done -SAVED="`pwd`" -cd "`dirname \"$PRG\"`/" >/dev/null -APP_HOME="`pwd -P`" -cd "$SAVED" >/dev/null - -APP_NAME="Gradle" -APP_BASE_NAME=`basename "$0"` - -# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' - -# Use the maximum available, or set MAX_FD != -1 to use that value. -MAX_FD="maximum" - -warn () { - echo "$*" -} - -die () { - echo - echo "$*" - echo - exit 1 -} - -# OS specific support (must be 'true' or 'false'). -cygwin=false -msys=false -darwin=false -nonstop=false -case "`uname`" in - CYGWIN* ) - cygwin=true - ;; - Darwin* ) - darwin=true - ;; - MINGW* ) - msys=true - ;; - NONSTOP* ) - nonstop=true - ;; -esac - -CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar - -# Determine the Java command to use to start the JVM. -if [ -n "$JAVA_HOME" ] ; then - if [ -x "$JAVA_HOME/jre/sh/java" ] ; then - # IBM's JDK on AIX uses strange locations for the executables - JAVACMD="$JAVA_HOME/jre/sh/java" - else - JAVACMD="$JAVA_HOME/bin/java" - fi - if [ ! -x "$JAVACMD" ] ; then - die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME - -Please set the JAVA_HOME variable in your environment to match the -location of your Java installation." - fi -else - JAVACMD="java" - which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. - -Please set the JAVA_HOME variable in your environment to match the -location of your Java installation." -fi - -# Increase the maximum file descriptors if we can. -if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then - MAX_FD_LIMIT=`ulimit -H -n` - if [ $? -eq 0 ] ; then - if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then - MAX_FD="$MAX_FD_LIMIT" - fi - ulimit -n $MAX_FD - if [ $? -ne 0 ] ; then - warn "Could not set maximum file descriptor limit: $MAX_FD" - fi - else - warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" - fi -fi - -# For Darwin, add options to specify how the application appears in the dock -if $darwin; then - GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" -fi - -# For Cygwin or MSYS, switch paths to Windows format before running java -if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then - APP_HOME=`cygpath --path --mixed "$APP_HOME"` - CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` - JAVACMD=`cygpath --unix "$JAVACMD"` - - # We build the pattern for arguments to be converted via cygpath - ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` - SEP="" - for dir in $ROOTDIRSRAW ; do - ROOTDIRS="$ROOTDIRS$SEP$dir" - SEP="|" - done - OURCYGPATTERN="(^($ROOTDIRS))" - # Add a user-defined pattern to the cygpath arguments - if [ "$GRADLE_CYGPATTERN" != "" ] ; then - OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" - fi - # Now convert the arguments - kludge to limit ourselves to /bin/sh - i=0 - for arg in "$@" ; do - CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` - CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option - - if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition - eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` - else - eval `echo args$i`="\"$arg\"" - fi - i=`expr $i + 1` - done - case $i in - 0) set -- ;; - 1) set -- "$args0" ;; - 2) set -- "$args0" "$args1" ;; - 3) set -- "$args0" "$args1" "$args2" ;; - 4) set -- "$args0" "$args1" "$args2" "$args3" ;; - 5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; - 6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; - 7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; - 8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; - 9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; - esac -fi - -# Escape application args -save () { - for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done - echo " " -} -APP_ARGS=`save "$@"` - -# Collect all arguments for the java command, following the shell quoting and substitution rules -eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" - -exec "$JAVACMD" "$@" diff --git a/extra-dependencies/selenium/gradlew.bat b/extra-dependencies/selenium/gradlew.bat deleted file mode 100644 index 9618d8d960..0000000000 --- a/extra-dependencies/selenium/gradlew.bat +++ /dev/null @@ -1,100 +0,0 @@ -@rem -@rem Copyright 2015 the original author or authors. -@rem -@rem Licensed under the Apache License, Version 2.0 (the "License"); -@rem you may not use this file except in compliance with the License. -@rem You may obtain a copy of the License at -@rem -@rem https://www.apache.org/licenses/LICENSE-2.0 -@rem -@rem Unless required by applicable law or agreed to in writing, software -@rem distributed under the License is distributed on an "AS IS" BASIS, -@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -@rem See the License for the specific language governing permissions and -@rem limitations under the License. -@rem - -@if "%DEBUG%" == "" @echo off -@rem ########################################################################## -@rem -@rem Gradle startup script for Windows -@rem -@rem ########################################################################## - -@rem Set local scope for the variables with windows NT shell -if "%OS%"=="Windows_NT" setlocal - -set DIRNAME=%~dp0 -if "%DIRNAME%" == "" set DIRNAME=. -set APP_BASE_NAME=%~n0 -set APP_HOME=%DIRNAME% - -@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" - -@rem Find java.exe -if defined JAVA_HOME goto findJavaFromJavaHome - -set JAVA_EXE=java.exe -%JAVA_EXE% -version >NUL 2>&1 -if "%ERRORLEVEL%" == "0" goto init - -echo. -echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. -echo. -echo Please set the JAVA_HOME variable in your environment to match the -echo location of your Java installation. - -goto fail - -:findJavaFromJavaHome -set JAVA_HOME=%JAVA_HOME:"=% -set JAVA_EXE=%JAVA_HOME%/bin/java.exe - -if exist "%JAVA_EXE%" goto init - -echo. -echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% -echo. -echo Please set the JAVA_HOME variable in your environment to match the -echo location of your Java installation. - -goto fail - -:init -@rem Get command-line arguments, handling Windows variants - -if not "%OS%" == "Windows_NT" goto win9xME_args - -:win9xME_args -@rem Slurp the command line arguments. -set CMD_LINE_ARGS= -set _SKIP=2 - -:win9xME_args_slurp -if "x%~1" == "x" goto execute - -set CMD_LINE_ARGS=%* - -:execute -@rem Setup the command line - -set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar - -@rem Execute Gradle -"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% - -:end -@rem End local scope for the variables with windows NT shell -if "%ERRORLEVEL%"=="0" goto mainEnd - -:fail -rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of -rem the _cmd.exe /c_ return code! -if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 -exit /b 1 - -:mainEnd -if "%OS%"=="Windows_NT" endlocal - -:omega diff --git a/extra-dependencies/selenium/settings.gradle b/extra-dependencies/selenium/settings.gradle deleted file mode 100644 index 988ada53d1..0000000000 --- a/extra-dependencies/selenium/settings.gradle +++ /dev/null @@ -1,17 +0,0 @@ - -pluginManagement { - repositories { -// mavenLocal() - maven { - url "https://neo.jfrog.io/neo/docs-maven" // System.env.ARTIFACTORY_URI -/* - credentials { - username System.env.ARTIFACTORY_USERNAME - password System.env.ARTIFACTORY_PASSWORD - } -*/ - } - gradlePluginPortal() - - } -} \ No newline at end of file diff --git a/extra-dependencies/settings.gradle b/extra-dependencies/settings.gradle index 7e58e733b1..b8d4f10bf7 100644 --- a/extra-dependencies/settings.gradle +++ b/extra-dependencies/settings.gradle @@ -3,5 +3,4 @@ include('nlp') include('mongodb') include('email') include('couchbase') -include('xls') -include('selenium') \ No newline at end of file +include('xls') \ No newline at end of file diff --git a/full/build.gradle b/full/build.gradle index 2bcba2982b..3286340a86 100644 --- a/full/build.gradle +++ b/full/build.gradle @@ -68,11 +68,6 @@ dependencies { compile 'org.jsoup:jsoup:1.11.3' - compileOnly group: 'org.seleniumhq.selenium', name: 'selenium-java', version: '3.141.59' - testCompile group: 'org.seleniumhq.selenium', name: 'selenium-java', version: '3.141.59' - compileOnly group: 'io.github.bonigarcia', name: 'webdrivermanager', version: '3.1.1' - testCompile group: 'io.github.bonigarcia', name: 'webdrivermanager', version: '3.1.1' - compile group: 'org.roaringbitmap', name: 'RoaringBitmap', version: '0.7.17' compile(group: 'org.apache.commons', name: 'commons-configuration2', version: '2.7') { exclude group: "org.yaml" diff --git a/full/src/main/java/apoc/load/LoadHtml.java b/full/src/main/java/apoc/load/LoadHtml.java index 0d86016938..8ffaaf1925 100644 --- a/full/src/main/java/apoc/load/LoadHtml.java +++ b/full/src/main/java/apoc/load/LoadHtml.java @@ -2,7 +2,6 @@ import apoc.Extended; import apoc.result.MapResult; -import apoc.util.MissingDependencyException; import apoc.util.Util; import org.jsoup.Jsoup; import org.jsoup.nodes.Attribute; @@ -17,18 +16,10 @@ import org.neo4j.procedure.Procedure; import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.InputStream; import java.io.UnsupportedEncodingException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.function.Supplier; -import java.util.stream.Stream; -import static apoc.load.LoadHtmlBrowser.getChromeInputStream; -import static apoc.load.LoadHtmlBrowser.getFirefoxInputStream; +import java.util.*; +import java.util.stream.Stream; @Extended public class LoadHtml { @@ -36,6 +27,8 @@ public class LoadHtml { // public for test purpose public static final String KEY_ERROR = "errorList"; + private enum FailSilently { FALSE, WITH_LOG, WITH_LIST } + @Context public GraphDatabaseService db; @@ -46,13 +39,16 @@ public class LoadHtml { @Procedure @Description("apoc.load.html('url',{name: jquery, name2: jquery}, config) YIELD value - Load Html page and return the result as a Map") public Stream html(@Name("url") String url, @Name(value = "query",defaultValue = "{}") Map query, @Name(value = "config",defaultValue = "{}") Map config) { - return readHtmlPage(url, query, new LoadHtmlConfig(config)); + return readHtmlPage(url, query, config); } - private Stream readHtmlPage(String url, Map query, LoadHtmlConfig config) { + private Stream readHtmlPage(String url, Map query, Map config) { + String charset = config.getOrDefault("charset", "UTF-8").toString(); try { // baseUri is used to resolve relative paths - Document document = Jsoup.parse(getHtmlInputStream(url, query, config), config.getCharset(), config.getBaseUri()); + String baseUri = config.getOrDefault("baseUri", "").toString(); + + Document document = Jsoup.parse(Util.openInputStream(url, null, null), charset, baseUri); Map output = new HashMap<>(); List errorList = new ArrayList<>(); @@ -65,34 +61,19 @@ private Stream readHtmlPage(String url, Map query, Lo output.put(KEY_ERROR, errorList); } - return Stream.of(new MapResult(output)); - } catch (IllegalArgumentException | ClassCastException e) { - throw new RuntimeException("Invalid config: " + config); + return Stream.of(new MapResult(output) ); } catch (FileNotFoundException e) { throw new RuntimeException("File not found from: " + url); } catch(UnsupportedEncodingException e) { - throw new RuntimeException("Unsupported charset: " + config.getCharset()); + throw new RuntimeException("Unsupported charset: " + charset); } catch(Exception e) { throw new RuntimeException("Can't read the HTML from: "+ url, e); } } - - private InputStream getHtmlInputStream(String url, Map query, LoadHtmlConfig config) throws IOException { - - final boolean isHeadless = config.isHeadless(); - final boolean isAcceptInsecureCerts = config.isAcceptInsecureCerts(); - switch (config.getBrowser()) { - case FIREFOX: - return withSeleniumBrowser(() -> getFirefoxInputStream(url, query, config, isHeadless, isAcceptInsecureCerts)); - case CHROME: - return withSeleniumBrowser(() -> getChromeInputStream(url, query, config, isHeadless, isAcceptInsecureCerts)); - default: - return Util.openInputStream(url, null, null); - } - } - private List> getElements(Elements elements, LoadHtmlConfig conf, List errorList) { + private List> getElements(Elements elements, Map config, List errorList) { + FailSilently failConfig = FailSilently.valueOf((String) config.getOrDefault("failSilently", "FALSE")); List> elementList = new ArrayList<>(); for (Element element : elements) { @@ -103,10 +84,10 @@ private List> getElements(Elements elements, LoadHtmlConfig if(!element.val().isEmpty()) result.put("value", element.val()); if(!element.tagName().isEmpty()) result.put("tagName", element.tagName()); - if (conf.isChildren()) { + if (Util.toBoolean(config.getOrDefault("children", false))) { if(element.hasText()) result.put("text", element.ownText()); - result.put("children", getElements(element.children(), conf, errorList)); + result.put("children", getElements(element.children(), config, errorList)); } else { if(element.hasText()) result.put("text", element.text()); @@ -115,7 +96,7 @@ private List> getElements(Elements elements, LoadHtmlConfig elementList.add(result); } catch (Exception e) { final String parseError = "Error during parsing element: " + element; - switch (conf.getFailSilently()) { + switch (failConfig) { case WITH_LOG: log.warn(parseError); break; @@ -134,24 +115,11 @@ private List> getElements(Elements elements, LoadHtmlConfig private Map getAttributes(Element element) { Map attributes = new HashMap<>(); for (Attribute attribute : element.attributes()) { - if(!attribute.getValue().isEmpty()) { - final String key = attribute.getKey(); - // with href/src attribute we prepend baseUri path - final boolean attributeHasLink = key.equals("href") || key.equals("src"); - attributes.put(key, attributeHasLink ? element.absUrl(key) : attribute.getValue()); - } + if(!attribute.getValue().isEmpty()) attributes.put(attribute.getKey(), attribute.getValue()); } return attributes; } - private InputStream withSeleniumBrowser(Supplier action) { - try { - return action.get(); - } catch (NoClassDefFoundError e) { - throw new MissingDependencyException("Cannot find jars into the plugins folder.\n" + - "See the documentation: https://neo4j.com/labs/apoc/4.1/overview/apoc.load/apoc.load.html/#selenium-depencencies"); - } - } } \ No newline at end of file diff --git a/full/src/main/java/apoc/load/LoadHtmlBrowser.java b/full/src/main/java/apoc/load/LoadHtmlBrowser.java deleted file mode 100644 index 0e5bb25e84..0000000000 --- a/full/src/main/java/apoc/load/LoadHtmlBrowser.java +++ /dev/null @@ -1,58 +0,0 @@ -package apoc.load; - -import io.github.bonigarcia.wdm.WebDriverManager; -import org.apache.commons.io.IOUtils; -import org.openqa.selenium.By; -import org.openqa.selenium.WebDriver; -import org.openqa.selenium.chrome.ChromeDriver; -import org.openqa.selenium.chrome.ChromeOptions; -import org.openqa.selenium.firefox.FirefoxDriver; -import org.openqa.selenium.firefox.FirefoxOptions; -import org.openqa.selenium.support.ui.Wait; -import org.openqa.selenium.support.ui.WebDriverWait; - -import java.io.IOException; -import java.io.InputStream; -import java.util.Map; - -public class LoadHtmlBrowser { - - public static InputStream getChromeInputStream(String url, Map query, LoadHtmlConfig config, boolean isHeadless, boolean isAcceptInsecureCerts) { - WebDriverManager.chromedriver().setup(); - ChromeOptions chromeOptions = new ChromeOptions(); - chromeOptions.setHeadless(isHeadless); - chromeOptions.setAcceptInsecureCerts(isAcceptInsecureCerts); - return getInputStreamWithBrowser(url, query, config, new ChromeDriver(chromeOptions)); - } - - public static InputStream getFirefoxInputStream(String url, Map query, LoadHtmlConfig config, boolean isHeadless, boolean isAcceptInsecureCerts) { - WebDriverManager.firefoxdriver().setup(); - FirefoxOptions firefoxOptions = new FirefoxOptions(); - firefoxOptions.setHeadless(isHeadless); - firefoxOptions.setAcceptInsecureCerts(isAcceptInsecureCerts); - return getInputStreamWithBrowser(url, query, config, new FirefoxDriver(firefoxOptions)); - } - - private static InputStream getInputStreamWithBrowser(String url, Map query, LoadHtmlConfig config, WebDriver driver) { - driver.get(url); - - final long wait = config.getWait(); - if (wait > 0) { - Wait driverWait = new WebDriverWait(driver, wait); - try { - driverWait.until(webDriver -> query.values().stream() - .noneMatch(selector -> webDriver.findElements(By.cssSelector(selector)).isEmpty())); - } catch (org.openqa.selenium.TimeoutException ignored) { - // We continue the execution even if 1 or more elements were not found - } - } - InputStream stream; - try { - stream = IOUtils.toInputStream(driver.getPageSource(), config.getCharset()); - } catch (IOException e) { - throw new RuntimeException(e); - } - driver.close(); - return stream; - } -} diff --git a/full/src/main/java/apoc/load/LoadHtmlConfig.java b/full/src/main/java/apoc/load/LoadHtmlConfig.java deleted file mode 100644 index 1f3e66d94a..0000000000 --- a/full/src/main/java/apoc/load/LoadHtmlConfig.java +++ /dev/null @@ -1,67 +0,0 @@ -package apoc.load; - -import apoc.util.Util; - -import java.util.Collections; -import java.util.Map; - -public class LoadHtmlConfig { - enum Browser { NONE, CHROME, FIREFOX } - enum FailSilently { FALSE, WITH_LOG, WITH_LIST } - - private final boolean headless; - private final boolean acceptInsecureCerts; - private final boolean children; - - private final String charset; - private final String baseUri; - - private final Browser browser; - private final FailSilently failSilently; - - private final long wait; - - public LoadHtmlConfig(Map config) { - if (config == null) config = Collections.emptyMap(); - this.headless = Util.toBoolean(config.getOrDefault("headless", true)); - this.acceptInsecureCerts = Util.toBoolean(config.getOrDefault("acceptInsecureCerts", true)); - this.children = Util.toBoolean(config.getOrDefault("children", false)); - this.charset = (String) config.getOrDefault("charset", "UTF-8"); - this.baseUri = (String) config.getOrDefault("baseUri", ""); - this.browser = Browser.valueOf((String) config.getOrDefault("browser", Browser.NONE.toString())); - this.failSilently = FailSilently.valueOf((String) config.getOrDefault("failSilently", FailSilently.FALSE.toString())); - this.wait = Util.toLong(config.getOrDefault("wait", 0)); - } - - public boolean isHeadless() { - return headless; - } - - public boolean isAcceptInsecureCerts() { - return acceptInsecureCerts; - } - - public boolean isChildren() { - return children; - } - - public String getCharset() { - return charset; - } - - public String getBaseUri() { - return baseUri; - } - - public Browser getBrowser() { - return browser; - } - - public FailSilently getFailSilently() { - return failSilently; - } - - public long getWait() { - return wait; - } -} diff --git a/full/src/test/java/apoc/load/LoadHtmlTest.java b/full/src/test/java/apoc/load/LoadHtmlTest.java index 92f9643e9f..f5afea4130 100644 --- a/full/src/test/java/apoc/load/LoadHtmlTest.java +++ b/full/src/test/java/apoc/load/LoadHtmlTest.java @@ -13,8 +13,6 @@ import java.util.Collections; import java.util.List; import java.util.Map; -import java.util.Set; -import java.util.concurrent.atomic.AtomicInteger; import static apoc.load.LoadHtml.KEY_ERROR; import static apoc.util.MapUtil.map; @@ -24,11 +22,10 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNull; -import static org.junit.Assert.assertTrue; public class LoadHtmlTest { - protected static final String RESULT_QUERY_METADATA = ("{attributes={charset=UTF-8}, tagName=meta}, " + + private static final String RESULT_QUERY_METADATA = ("{attributes={charset=UTF-8}, tagName=meta}, " + "{attributes={name=ResourceLoaderDynamicStyles}, tagName=meta}, " + "{attributes={name=generator, content=MediaWiki 1.32.0-wmf.18}, tagName=meta}, " + "{attributes={name=referrer, content=origin}, tagName=meta}, " + @@ -36,16 +33,16 @@ public class LoadHtmlTest { "{attributes={name=referrer, content=origin-when-cross-origin}, tagName=meta}, " + "{attributes={property=og:image, content=https://upload.wikimedia.org/wikipedia/en/e/ea/Aap_Kaa_Hak_titles.jpg}, tagName=meta}"); - protected static final String RESULT_QUERY_H2 = ("{text=Contents, tagName=h2}, " + + private static final String RESULT_QUERY_H2 = ("{text=Contents, tagName=h2}, " + "{text=Origins[edit], tagName=h2}, " + "{text=Content[edit], tagName=h2}, " + "{text=Legacy[edit], tagName=h2}, " + "{text=References[edit], tagName=h2}, " + "{text=Navigation menu, tagName=h2}"); - protected static final String INVALID_PATH = new File("src/test/resources/wikipedia1.html").toURI().toString(); - protected static final String VALID_PATH = new File("src/test/resources/wikipedia.html").toURI().toString(); - protected static final String INVALID_CHARSET = "notValid"; + private static final String INVALID_PATH = new File("src/test/resources/wikipedia1.html").toURI().toString(); + private static final String VALID_PATH = new File("src/test/resources/wikipedia.html").toURI().toString(); + private static final String INVALID_CHARSET = "notValid"; @Rule public DbmsRule db = new ImpermanentDbmsRule(); @@ -56,56 +53,20 @@ public void setup() { } @Test - public void testParseGeneratedJs() { - testCallGeneratedJsWithBrowser("FIREFOX"); - testCallGeneratedJsWithBrowser("CHROME"); - } - - @Test - public void testWithWaitUntilAndOneElementNotFound() { - testCall(db, "CALL apoc.load.html($url,$query,$config)", - map("url",new File("src/test/resources/html/wikipediaWithJs.html").toURI().toString(), - "query", map("elementExistent", "strong", "elementNotExistent", ".asdfgh"), - "config", map("browser", "CHROME", "wait", 5)), - result -> { - Map value = (Map) result.get("value"); - List> notExistent = (List>) value.get("elementNotExistent"); - List> existent = (List>) value.get("elementExistent"); - assertTrue(notExistent.isEmpty()); - assertEquals(1, existent.size()); - final Map tag = existent.get(0); - assertEquals("This is a new text node", tag.get("text")); - assertEquals("strong", tag.get("tagName")); - }); - } + public void testQueryAll(){ + Map query = map("metadata", "meta", "h2", "h2"); - @Test - public void testWithBaseUriConfig() { - Map query = map("urlTest", ".urlTest"); - - final String baseUri = new File("src/test/resources").toURI().toString(); - testCall(db, "CALL apoc.load.html($url,$query, $config)", - map("url", new File("src/test/resources/html/wikipediaWithJs.html").toURI().toString(), - "query", query, - "config", map("baseUri", baseUri)), + testResult(db, "CALL apoc.load.html($url,$query, $config)", map("url",new File("src/test/resources/wikipedia.html").toURI().toString(), "query", query, "config", Collections.emptyMap()), result -> { - Map value = (Map) result.get("value"); - final List> urlTestList = (List>) value.get("urlTest"); - Map absoluteUrlTag = map("tagName", "a", "text", "absoluteUrl", - "attributes", map("href", "https://foundation.wikimedia.org/wiki/Privacy_policy", "class", "urlTest")); - - Map urlSameUrlTag = map("tagName", "a", "text", "urlSamePath", - "attributes", map("href", baseUri + "this.js", "class", "urlTest")); - - Map forwardUrlTag = map("tagName", "a", "text", "forwardUrl", - "attributes", map("href", "file:/test.js", "class", "urlTest")); + Map row = result.next(); + Map value = (Map) row.get("value"); - Map backUrlTag = map("tagName", "a", "text", "backUrl", - "attributes", map("href", baseUri.replace("test/resources/", "backUrl.js"), "class", "urlTest")); + List> metadata = (List>) value.get("metadata"); + List> h2 = (List>) value.get("h2"); - final Set> expectedSetList = Set.of(absoluteUrlTag, urlSameUrlTag, forwardUrlTag, backUrlTag); - assertEquals(expectedSetList, Set.copyOf(urlTestList)); - }); + assertEquals(asList(RESULT_QUERY_METADATA).toString().trim(), metadata.toString().trim()); + assertEquals(asList(RESULT_QUERY_H2).toString().trim(), h2.toString().trim()); + }); } @Test @@ -132,6 +93,45 @@ public void testQueryH2(){ }); } + @Test + public void testQueryH2WithConfig(){ + Map query = map("h2", "h2"); + Map config = map("charset", "UTF-8", "baserUri", ""); + + testResult(db, "CALL apoc.load.html($url,$query, $config)", map("url",new File("src/test/resources/wikipedia.html").toURI().toString(), "query", query, "config", config), + result -> { + Map row = result.next(); + assertEquals(map("h2",asList(RESULT_QUERY_H2)).toString().trim(), row.get("value").toString().trim()); + assertFalse(result.hasNext()); + }); + } + + @Test + public void testQueryWithChildren() { + Map query = map("toc", ".toc ul"); + Map config = map("children", true); + + testResult(db, "CALL apoc.load.html($url,$query, $config)", map("url",new File("src/test/resources/wikipedia.html").toURI().toString(), "query", query, "config", config), + result -> { + Map row = result.next(); + Map value = (Map) row.get("value"); + + List> toc = (List) value.get("toc"); + Map first = toc.get(0); + + // Should be
    + assertEquals("ul", first.get("tagName")); + + // Should have four children + assertEquals(4, ((List) first.get("children")).size()); + + Map firstChild = (Map)((List) first.get("children")).get(0); + + assertEquals("li", firstChild.get("tagName")); + assertEquals(1, ((List) firstChild.get("children")).size()); + }); + } + @Test public void testQueryWithFailsSilentlyWithLog() { Map query = map("a", "a", "invalid", "invalid", "h6", "h6"); @@ -210,12 +210,12 @@ public void testQueryWithExceptionIfIncorrectUrl() { @Test(expected = QueryExecutionException.class) public void testQueryWithFailsSilentlyWithLogWithExceptionIfIncorrectUrl() { - testIncorrectUrl("CALL apoc.load.html('" + INVALID_PATH + "',{a:'a'}, {failSilently: 'WITH_LOG'})"); + testIncorrectUrl("CALL apoc.load.html('" + INVALID_PATH + "',{failSilently: 'WITH_LOG', a:'a'})"); } @Test(expected = QueryExecutionException.class) public void testQueryWithFailsSilentlyWithListWithExceptionIfIncorrectUrl() { - testIncorrectUrl("CALL apoc.load.html('" + INVALID_PATH + "',{a:'a'}, {failSilently: 'WITH_LIST'})"); + testIncorrectUrl("CALL apoc.load.html('" + INVALID_PATH + "',{failSilently: 'WITH_LIST', a:'a'})"); } @Test(expected = QueryExecutionException.class) @@ -233,20 +233,6 @@ public void testQueryWithFailsSilentlyWithListWithExceptionIfIncorrectCharset() testIncorrectCharset("CALL apoc.load.html('" + VALID_PATH + "',{a:'a'}, {failSilently: 'WITH_LIST', charset: '" + INVALID_CHARSET + "'})"); } - @Test(expected = QueryExecutionException.class) - public void testFailsWithIncorrectBrowser() { - final String invalidValue = "NOT_VALID"; - final Map config = Map.of("browser", invalidValue); - try { - testCall(db, "CALL apoc.load.html('" + VALID_PATH + "',{a:'a'}, $config)", Map.of("config", config), (r) -> {}); - } catch (Exception e) { - Throwable except = ExceptionUtils.getRootCause(e); - String expectedMessage = "No enum constant " + LoadHtmlConfig.Browser.class.getCanonicalName() + "." + invalidValue; - assertEquals(expectedMessage, except.getMessage()); - throw e; - } - } - private void testIncorrectCharset(String query) { try { testCall(db, query, (r) -> {}); @@ -268,27 +254,4 @@ private void testIncorrectUrl(String query) { throw e; } } - - private void testCallGeneratedJsWithBrowser(String browser) { - testCall(db, "CALL apoc.load.html($url,$query,$config)", - map("url",new File("src/test/resources/html/wikipediaWithJs.html").toURI().toString(), - "query", map("td", "td", "strong", "strong"), - "config", map("browser", browser)), - result -> { - Map value = (Map) result.get("value"); - List> tdList = (List>) value.get("td"); - List> strongList = (List>) value.get("strong"); - assertEquals(4, tdList.size()); - final String templateString = "foo bar - baz"; - AtomicInteger integer = new AtomicInteger(); - tdList.forEach(tag -> { - assertEquals("td", tag.get("tagName")); - assertEquals(integer.getAndIncrement() + templateString, tag.get("text")); - }); - assertEquals(1, strongList.size()); - final Map tagStrong = strongList.get(0); - assertEquals("This is a new text node", tagStrong.get("text")); - assertEquals("strong", tagStrong.get("tagName")); - }); - } } diff --git a/full/src/test/java/apoc/load/LoadHtmlTestParameterized.java b/full/src/test/java/apoc/load/LoadHtmlTestParameterized.java deleted file mode 100644 index 3b9cc8123c..0000000000 --- a/full/src/test/java/apoc/load/LoadHtmlTestParameterized.java +++ /dev/null @@ -1,125 +0,0 @@ -package apoc.load; - -import apoc.util.TestUtil; -import org.junit.Before; -import org.junit.Rule; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; -import org.junit.runners.Parameterized.Parameter; -import org.junit.runners.Parameterized.Parameters; -import org.neo4j.test.rule.DbmsRule; -import org.neo4j.test.rule.ImpermanentDbmsRule; - -import java.io.File; -import java.util.Collection; -import java.util.List; -import java.util.Map; - -import static apoc.load.LoadHtmlTest.RESULT_QUERY_H2; -import static apoc.load.LoadHtmlTest.RESULT_QUERY_METADATA; -import static apoc.util.MapUtil.map; -import static apoc.util.TestUtil.testResult; -import static com.google.common.collect.Lists.newArrayList; -import static java.util.Arrays.asList; -import static java.util.Collections.emptyMap; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; - -@RunWith(Parameterized.class) -public class LoadHtmlTestParameterized { - // Tests taken from LoadHtmlTest.java. - // To check that `browser` configuration preserve the result. - - @Rule - public DbmsRule db = new ImpermanentDbmsRule(); - - @Before - public void setup() { - TestUtil.registerProcedure(db, LoadHtml.class); - } - - - @Parameters - public static Collection data() { - return List.of("notSet", "NONE", "CHROME", "FIREFOX"); - } - - @Parameter - public String browser; - - - @Test - public void testQueryAll() { - Map query = map("metadata", "meta", "h2", "h2"); - - Map config = browserSet() ? Map.of("browser", browser) : emptyMap(); - testResult(db, "CALL apoc.load.html($url,$query, $config)", - map("url",new File("src/test/resources/wikipedia.html").toURI().toString(), "query", query, "config", config), - result -> { - Map row = result.next(); - Map value = (Map) row.get("value"); - - List> metadata = (List>) value.get("metadata"); - List> h2 = (List>) value.get("h2"); - - assertEquals(asList(RESULT_QUERY_METADATA).toString().trim(), metadata.toString().trim()); - assertEquals(asList(RESULT_QUERY_H2).toString().trim(), h2.toString().trim()); - }); - } - - @Test - public void testQueryH2WithConfig() { - Map query = map("h2", "h2"); - final List confList = newArrayList("charset", "UTF-8", "baseUri", ""); - addBrowserIfSet(confList); - Map config = map(confList.toArray()); - - testResult(db, "CALL apoc.load.html($url, $query, $config)", - map("url",new File("src/test/resources/wikipedia.html").toURI().toString(), "query", query, "config", config), - result -> { - Map row = result.next(); - assertEquals(map("h2",asList(RESULT_QUERY_H2)).toString().trim(), row.get("value").toString().trim()); - assertFalse(result.hasNext()); - }); - } - - @Test - public void testQueryWithChildren() { - Map query = map("toc", ".toc ul"); - final List confList = newArrayList("children", true); - addBrowserIfSet(confList); - Map config = map(confList.toArray()); - - testResult(db, "CALL apoc.load.html($url, $query, $config)", - map("url",new File("src/test/resources/wikipedia.html").toURI().toString(), "query", query, "config", config), - result -> { - Map row = result.next(); - Map value = (Map) row.get("value"); - - List> toc = (List) value.get("toc"); - Map first = toc.get(0); - - // Should be
      - assertEquals("ul", first.get("tagName")); - - // Should have four children - assertEquals(4, ((List) first.get("children")).size()); - - Map firstChild = (Map)((List) first.get("children")).get(0); - - assertEquals("li", firstChild.get("tagName")); - assertEquals(1, ((List) firstChild.get("children")).size()); - }); - } - - private void addBrowserIfSet(List confList) { - if (browserSet()) { - confList.addAll(List.of("browser", browser)); - } - } - - private boolean browserSet() { - return !browser.equals("notSet"); - } -} diff --git a/full/src/test/resources/html/wikipediaWithJs.html b/full/src/test/resources/html/wikipediaWithJs.html deleted file mode 100644 index 503d57e7c3..0000000000 --- a/full/src/test/resources/html/wikipediaWithJs.html +++ /dev/null @@ -1,37 +0,0 @@ - - - - - - - - - - -Aap Kaa Hak - Wikipedia - - -
      -
      - - - - - -
      col1col2
      -
      - - - diff --git a/full/src/test/resources/loadData.js b/full/src/test/resources/loadData.js deleted file mode 100644 index e291a3e176..0000000000 --- a/full/src/test/resources/loadData.js +++ /dev/null @@ -1,17 +0,0 @@ -// append rows and cols to table.data in page.html -function loadData() { - data = document.getElementById("data"); - let index = 0; - for (let row = 0; row < 2; row++) { - let tr = document.createElement("tr"); - for (let col = 0; col < 2; col++) { - td = document.createElement("td"); - td.appendChild(document.createTextNode(index + `foo - bar - - baz`)); - tr.appendChild(td); - index++; - } - data.appendChild(tr); - } -} diff --git a/full/src/test/resources/wikipedia.html b/full/src/test/resources/wikipedia.html index 36f6e34464..ef3500ad68 100644 --- a/full/src/test/resources/wikipedia.html +++ b/full/src/test/resources/wikipedia.html @@ -238,7 +238,7 @@
      incorrecttest
      - + diff --git a/full/src/test/resources/wikipediaWithJs.html b/full/src/test/resources/wikipediaWithJs.html deleted file mode 100644 index 93d096bac8..0000000000 --- a/full/src/test/resources/wikipediaWithJs.html +++ /dev/null @@ -1,46 +0,0 @@ - - - - - - - -Aap Kaa Hak - Wikipedia - - - - - - - - - - - - - - - - - - - - - - - - - - - -
      - - - - - -
      col1col2
      -
      - -