diff --git a/doc/Deep-Learning-models.md b/doc/Deep-Learning-models.md index 1d72066ec5..dec4f3d0c3 100644 --- a/doc/Deep-Learning-models.md +++ b/doc/Deep-Learning-models.md @@ -4,7 +4,7 @@ Since GROBID version `0.5.4` (2018), it is possible to use in GROBID recent Deep Learning sequence labelling models trained with [DeLFT](https://github.com/kermitt2/delft). The available neural models include in particular BidLSTM-CRF with Glove embeddings, with additional feature channel (for layout features), with ELMo, and transformer-based fine-tuned architectures with or without CRF activation layer (e.g. SciBERT-CRF), which can be used as alternative to the default Wapiti CRF. -These architectures have been tested on Linux 64bit and macOS. +These architectures have been tested on Linux 64bit and macOS 64bit (Intel). Support for macOS on ARM is in progress. Integration is realized via Java Embedded Python [JEP](https://github.com/ninia/jep), which uses a JNI of CPython. This integration is two times faster than the Tensorflow Java API and significantly faster than RPC serving (see [here](https://www.slideshare.net/FlinkForward/flink-forward-berlin-2017-dongwon-kim-predictive-maintenance-with-apache-flink). Additionally, it does not require to modify DeLFT as it would be the case with Py4J gateway (socket-based). diff --git a/doc/Introduction.md b/doc/Introduction.md index 01142c136f..18ca0cbc78 100644 --- a/doc/Introduction.md +++ b/doc/Introduction.md @@ -57,7 +57,7 @@ The key aspects of GROBID are the following ones: By default, the GROBID extraction and parsing algorithms use a [fork](https://github.com/kermitt2/wapiti) of [Wapiti CRF library](http://wapiti.limsi.fr). 
As alternative, it is possible to perform the sequence labelling with [DeLFT](https://github.com/kermitt2/delft) deep learning models (typically BidLSTM-CRF with or without ELMo, or BERT-CRF, with additional feature channels) instead of Wapiti CRF models, using a native integration via [JEP](https://github.com/ninia/jep). The native libraries, in particular TensorFlow, are transparently integrated as JNI with dynamic call based on the current OS. Deep Learning models should be used when accuracy is the main priority, they often involve reduced scalability. See the related [benchmarking](End-to-end-evaluation.md). -GROBID should run properly "out of the box" on Linux (32 and 64 bits) and macOS. +GROBID should run properly "out of the box" on Linux (32 and 64 bits) and macOS (Intel and ARM). ## Credits diff --git a/doc/Troubleshooting.md b/doc/Troubleshooting.md index 5632ce8003..a407965196 100644 --- a/doc/Troubleshooting.md +++ b/doc/Troubleshooting.md @@ -36,7 +36,7 @@ In case of running on limited memory hardware, there are various ways to deal wi ### Windows related issues -Grobid is developed and tested on Linux. macOS is also supported, although some components might behave slighly different due to the natural incompatibility of Apple with the rest of the world and the availability on some proprietary fonts on this platform. +Grobid is developed and tested on Linux. macOS is also supported, although some components might behave slightly differently due to the natural incompatibility of Apple with the rest of the world and the availability of some proprietary fonts on this platform. Windows, unfortunately, is currently not anymore supported, due to lack of experience and time constraints. We recommend Windows users to use the [Grobid Docker image](https://hub.docker.com/r/lfoppiano/grobid/) (documented [here](Grobid-docker.md)) and call the system via API using one of the various [grobid clients](Grobid-service.md#Clients-for-GROBID-Web-Services). 
diff --git a/grobid-core/src/main/java/org/grobid/core/utilities/Utilities.java b/grobid-core/src/main/java/org/grobid/core/utilities/Utilities.java index af2bf9aa5d..de88ab3aa4 100755 --- a/grobid-core/src/main/java/org/grobid/core/utilities/Utilities.java +++ b/grobid-core/src/main/java/org/grobid/core/utilities/Utilities.java @@ -287,8 +287,13 @@ public static BiblioItem cleanZFNMetadata(BiblioItem item) { public static String getOsNameAndArch() { String osPart = System.getProperty("os.name").replace(" ", "") .toLowerCase().substring(0, 3); + if (StringUtils.equals(osPart, "mac")) { + if (StringUtils.equals(System.getProperty("os.arch"), "aarch64")){ + osPart = osPart+"_arm"; + } + } String archPart = System.getProperty("sun.arch.data.model"); - return String.format("%s-%s", osPart, archPart); + return String.format("%s-%s", osPart, archPart); } /** diff --git a/grobid-home/lib/mac_arm-64/libwapiti.dylib b/grobid-home/lib/mac_arm-64/libwapiti.dylib new file mode 100755 index 0000000000..35505dd059 Binary files /dev/null and b/grobid-home/lib/mac_arm-64/libwapiti.dylib differ diff --git a/grobid-home/pdfalto/mac_arm-64/pdfalto b/grobid-home/pdfalto/mac_arm-64/pdfalto new file mode 100755 index 0000000000..0ce2903882 Binary files /dev/null and b/grobid-home/pdfalto/mac_arm-64/pdfalto differ diff --git a/grobid-home/pdfalto/mac_arm-64/pdfalto_server b/grobid-home/pdfalto/mac_arm-64/pdfalto_server new file mode 100755 index 0000000000..07f1b001cb --- /dev/null +++ b/grobid-home/pdfalto/mac_arm-64/pdfalto_server @@ -0,0 +1,43 @@ +#!/bin/bash + +# Timeout. +timeout=20 # 20 seconds +# Interval between checks if the process is still alive. +interval=1 +# Delay between posting the SIGTERM signal and destroying the process by SIGKILL. 
+delay=0 +command=${0:0:${#0}-7} + +args=("$@") +pdfalto_params=() + +for ((n=0; n<="$#";n++)); do + case ${args[n]} in + --timeout) + timeout=${args[n+1]} + ((n++)) + ;; + *) + pdfalto_params+=" ${args[n]}" + ;; + esac +done + +# kill -0 pid Exit code indicates if a signal may be sent to $pid process. +( + ((t = timeout)) + + while ((t > 0)); do + sleep $interval + kill -0 $$ || exit 0 + ((t -= interval)) + done + + # Be nice, post SIGTERM first. + # The 'exit 0' below will be executed if any preceding command fails. + kill -s SIGTERM $$ && kill -0 $$ || exit 0 + sleep $delay + kill -s SIGKILL $$ +) 2> /dev/null & + +exec $command $pdfalto_params diff --git a/grobid-home/pdfalto/mac_arm-64/xpdfrc b/grobid-home/pdfalto/mac_arm-64/xpdfrc new file mode 100644 index 0000000000..5b909e6b33 --- /dev/null +++ b/grobid-home/pdfalto/mac_arm-64/xpdfrc @@ -0,0 +1,61 @@ +#----- begin Arabic support package (2011-aug-15) +unicodeMap ISO-8859-6 ../languages/xpdf-arabic/ISO-8859-6.unicodeMap +#----- end Arabic support package +#----- begin Chinese Simplified support package (2011-sep-02) +cidToUnicode Adobe-GB1 ../languages/xpdf-chinese-simplified/Adobe-GB1.cidToUnicode +unicodeMap ISO-2022-CN ../languages/xpdf-chinese-simplified/ISO-2022-CN.unicodeMap +unicodeMap EUC-CN ../languages/xpdf-chinese-simplified/EUC-CN.unicodeMap +unicodeMap GBK ../languages/xpdf-chinese-simplified/GBK.unicodeMap +cMapDir Adobe-GB1 ../languages/xpdf-chinese-simplified/CMap +toUnicodeDir ../languages/xpdf-chinese-simplified/CMap +#fontFileCC Adobe-GB1 /usr/..../NotoSansCJKsc-Regular.otf +#----- end Chinese Simplified support package +#----- begin Chinese Traditional support package (2011-sep-02) +cidToUnicode Adobe-CNS1 ../languages/xpdf-chinese-traditional/Adobe-CNS1.cidToUnicode +unicodeMap Big5 ../languages/xpdf-chinese-traditional/Big5.unicodeMap +unicodeMap Big5ascii ../languages/xpdf-chinese-traditional/Big5ascii.unicodeMap +cMapDir Adobe-CNS1 ../languages/xpdf-chinese-traditional/CMap 
+toUnicodeDir ../languages/xpdf-chinese-traditional/CMap +#fontFileCC Adobe-CNS1 /usr/..../NotoSansCJKtc-Regular.otf +#----- end Chinese Traditional support package +#----- begin Cyrillic support package (2011-aug-15) +nameToUnicode ../languages/xpdf-cyrillic/Bulgarian.nameToUnicode +unicodeMap KOI8-R ../languages/xpdf-cyrillic/KOI8-R.unicodeMap +#----- end Cyrillic support package +#----- begin Greek support package (2011-aug-15) +nameToUnicode ../languages/xpdf-greek/Greek.nameToUnicode +unicodeMap ISO-8859-7 ../languages/xpdf-greek/ISO-8859-7.unicodeMap +#----- end Greek support package +#----- begin Hebrew support package (2011-aug-15) +unicodeMap ISO-8859-8 ../languages/xpdf-hebrew/ISO-8859-8.unicodeMap +unicodeMap Windows-1255 ../languages/xpdf-hebrew/Windows-1255.unicodeMap +#----- end Hebrew support package +#----- begin Japanese support package (2011-sep-02) +cidToUnicode Adobe-Japan1 ../languages/xpdf-japanese/Adobe-Japan1.cidToUnicode +unicodeMap ISO-2022-JP ../languages/xpdf-japanese/ISO-2022-JP.unicodeMap +unicodeMap EUC-JP ../languages/xpdf-japanese/EUC-JP.unicodeMap +unicodeMap Shift-JIS ../languages/xpdf-japanese/Shift-JIS.unicodeMap +cMapDir Adobe-Japan1 ../languages/xpdf-japanese/CMap +toUnicodeDir ../languages/xpdf-japanese/CMap +#fontFileCC Adobe-Japan1 /usr/..../NotoSansCJKjp-Regular.otf +#----- end Japanese support package +#----- begin Korean support package (2011-sep-02) +cidToUnicode Adobe-Korea1 ../languages/xpdf-korean/Adobe-Korea1.cidToUnicode +cidToUnicode Adobe-KR ../languages/xpdf-korean/Adobe-KR.cidToUnicode +unicodeMap ISO-2022-KR ../languages/xpdf-korean/ISO-2022-KR.unicodeMap +cMapDir Adobe-Korea1 ../languages/xpdf-korean/CMap +cMapDir Adobe-KR ../languages/xpdf-korean/CMap +toUnicodeDir ../languages/xpdf-korean/CMap +#fontFileCC Adobe-Korea1 /usr/..../NotoSansCJKkr-Regular.otf +#fontFileCC Adobe-KR /usr/..../NotoSansCJKkr-Regular.otf +#----- end Korean support package +#----- begin Latin2 support package (2011-aug-15) 
+unicodeMap Latin2 ../languages/xpdf-latin2/Latin2.unicodeMap +#----- end Latin2 support package +#----- begin Thai support package (2011-aug-15) +nameToUnicode ../languages/xpdf-thai/Thai.nameToUnicode +unicodeMap TIS-620 ../languages/xpdf-thai/TIS-620.unicodeMap +#----- end Thai support package +#----- begin Turkish support package (2011-aug-15) +unicodeMap ISO-8859-9 ../languages/xpdf-turkish/ISO-8859-9.unicodeMap +#----- end Turkish support package