diff --git a/paper/paper.bib b/paper/paper.bib
new file mode 100644
index 0000000..7244367
--- /dev/null
+++ b/paper/paper.bib
@@ -0,0 +1,84 @@
+@Article{Julia,
+  Title = {{J}ulia: A Fresh Approach to Numerical Computing},
+  Author = {Jeff Bezanson and Alan Edelman and Stefan Karpinski and Viral B. Shah},
+  Year = {2014},
+  Eprint = {1411.1607},
+  Eprintclass = {cs.MS},
+  Eprinttype = {arXiv},
+  Keywords = {tools},
+  Timestamp = {2015.12.18},
+  Url = {http://arxiv.org/abs/1411.1607}
+}
+
+@Inproceedings{NLTK1,
+  Title = {NLTK: the natural language toolkit},
+  Author = {Bird, Steven and Loper, Edward},
+  Booktitle = {Proceedings of the ACL 2004 on Interactive poster and demonstration sessions},
+  Year = {2004},
+  Organization = {Association for Computational Linguistics},
+  Pages = {31},
+  Timestamp = {2018.02.07},
+  Url = {http://www.aclweb.org/anthology/P04-3031}
+}
+
+@Book{NLTK2,
+  Title = {Natural language processing with Python},
+  Author = {Bird, Steven and Klein, Ewan and Loper, Edward},
+  Publisher = {O'Reilly Media, Inc.},
+  Year = {2009},
+  Keywords = {software, tools},
+  Timestamp = {2015.07.12},
+  Url = {http://www.nltk.org/}
+}
+
+@Electronic{penntok,
+  Author = {MacIntyre, Robert},
+  Title = {Sed script to produce Penn Treebank tokenization on arbitrary raw text},
+  Organization = {Massachusetts Institute of Technology},
+  Url = {https://web.archive.org/web/20130804202913/http://www.cis.upenn.edu/%7Etreebank/tokenizer.sed},
+  Urldate = {2018-08-31},
+  Year = {1995}
+}
+
+@Electronic{toktok,
+  Author = {Dehdari, Jonathan},
+  Title = {tok-tok: A fast, simple, multilingual tokenizer},
+  Url = {https://github.com/jonsafari/tok-tok},
+  Urldate = {2018-08-31},
+  Year = {2015}
+}
+
+@Phdthesis{toktokpub,
+  Title = {A Neurophysiologically-Inspired Statistical Language Model},
+  Author = {Dehdari, Jonathan},
+  Year = {2014},
+  School = {The Ohio State University}
+}
+
+@Article{reversibletok1,
+  Author = {Sebastian J. Mielke and Jason Eisner},
+  Title = {Spell Once, Summon Anywhere: {A} Two-Level Open-Vocabulary Language Model},
+  Journal = {CoRR},
+  Volume = {abs/1804.08205},
+  Year = {2018},
+  Url = {http://arxiv.org/abs/1804.08205},
+  Archiveprefix = {arXiv},
+  Eprint = {1804.08205},
+  Timestamp = {Mon, 13 Aug 2018 16:49:01 +0200},
+  Biburl = {https://dblp.org/rec/bib/journals/corr/abs-1804-08205},
+  Bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@Online{reversibletok2,
+  Author = {Sebastian J. Mielke},
+  Title = {A simple, reversible, language-agnostic tokenizer},
+  Year = {2019},
+  Url = {https://sjmielke.com/papers/tokenize/},
+  Urldate = {2018-04-22}
+}
+
+@Online{tweettok,
+  Author = {Christopher Potts},
+  Title = {Sentiment Symposium Tutorial: Tokenizing},
+  Year = {2011},
+  Url = {http://sentiment.christopherpotts.net/tokenizing.html#sentiment},
+  Urldate = {2019}
+}
diff --git a/paper/paper.md b/paper/paper.md
new file mode 100644
index 0000000..13c4772
--- /dev/null
+++ b/paper/paper.md
@@ -0,0 +1,88 @@
+---
+title: 'WordTokenizers.jl: Basic tools for tokenizing natural language in Julia'
+tags:
+  - julialang
+  - natural language processing (NLP)
+  - tokenization
+  - text mining
+  - information retrieval
+authors:
+  - name: Ayush Kaushal
+    orcid: 0000-0002-6703-0728
+    affiliation: 1
+  - name: Lyndon White
+    orcid: 0000-0003-1386-1646
+    affiliation: 2
+  - name: Mike Innes
+    orcid: 0000-0003-0788-0242
+    affiliation: 3
+  - name: Rohit Kumar
+    orcid: 0000-0002-6758-8350
+    affiliation: 4
+
+affiliations:
+  - name: Indian Institute of Technology, Kharagpur
+    index: 1
+  - name: The University of Western Australia
+    index: 2
+  - name: Julia Computing
+    index: 3
+  - name: ABV-Indian Institute of Information Technology and Management Gwalior
+    index: 4
+
+date: 1 July 2019
+bibliography: paper.bib
+---
+
+# Summary
+
+WordTokenizers.jl is a tool to help users of the Julia programming language [@Julia] work with natural language.
+In natural language processing (NLP), tokenization refers to breaking a text up into parts -- the tokens.
+Most commonly, this means breaking a sentence up into words and other tokens such as punctuation.
+Such _word tokenization_ often also includes some normalization, such as correcting unusual spellings or removing all punctuation.
+Complementary to word tokenization is _sentence segmentation_ (sometimes called _sentence tokenization_),
+where a document is broken up into sentences, which can then be tokenized into words.
+Tokenization and sentence segmentation are among the most fundamental operations performed before applying most NLP or information retrieval algorithms.
+
+WordTokenizers.jl provides a flexible API for defining fast tokenizers and sentence segmenters.
+Using this API, several standard tokenizers and sentence segmenters have been implemented, allowing researchers and practitioners to focus on the higher-level details of their NLP tasks.
+
+WordTokenizers.jl does not implement significant novel tokenizers or sentence segmenters.
+Rather, it contains ports/implementations of well-established and commonly used algorithms.
+At present, it contains rule-based methods primarily designed for English.
+Several of the implementations are sourced from the Python NLTK project [@NLTK1; @NLTK2],
+although these were in turn sourced from older pre-existing methods.
+
+WordTokenizers.jl uses a `TokenBuffer` API and its various lexers for fast word tokenization.
+`TokenBuffer` turns the input string into a readable stream.
+A desired set of `TokenBuffer` lexers is used to read characters from the stream and flush them out into an array of tokens.
+The package provides the following tokenizers built using this API:
+
+- A tweet tokenizer [@tweettok] for casual text.
+- A general-purpose NLTK tokenizer [@NLTK1; @NLTK2].
+- An improved version of the multilingual Tok-tok tokenizer [@toktok; @toktokpub].
+
+With the various lexers written for the `TokenBuffer` API, users can also create their own high-speed custom tokenizers with ease, as sketched below.
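+
+The following is only a minimal, illustrative sketch of such a custom tokenizer; the `spaces`, `number`, and `character` lexers and the `isdone`/`flush!` helpers are assumed to be those shipped with the package, and the exact names and behaviour should be checked against its documentation:
+
+```julia
+using WordTokenizers: TokenBuffer, isdone, flush!, spaces, number, character
+
+# Try each lexer in turn; whichever matches consumes part of the stream.
+# Repeat until the whole input has been read.
+function my_tokenize(input)
+    ts = TokenBuffer(input)
+    while !isdone(ts)
+        spaces(ts) ||    # drop whitespace, flushing the pending token
+        number(ts)  ||   # consume a run of digits as a single token
+        character(ts)    # otherwise take the next character into the buffer
+    end
+    flush!(ts)           # flush any trailing token (assumed to be a no-op if none is pending)
+    return ts.tokens
+end
+
+my_tokenize("It costs 5 dollars")  # expected: ["It", "costs", "5", "dollars"]
+```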
+
+The package also provides a simple reversible tokenizer [@reversibletok1; @reversibletok2],
+which works by inserting special merge symbols between tokens, so that the original string can be reconstructed from the token sequence.
+
+WordTokenizers.jl exposes a configurable default interface,
+which allows the default tokenizer and sentence segmenter to be set globally (wherever this default interface is used).
+This allows for easy benchmarking and comparison of different methods.
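+
+As a usage sketch of this default interface (assuming the `tokenize`, `split_sentences`, `set_tokenizer`, and `toktok_tokenize` names exported by the package), swapping the globally used tokenizer might look like:
+
+```julia
+using WordTokenizers
+
+text = "WordTokenizers.jl is fast! It is also simple to use."
+
+split_sentences(text)   # segment into sentences with the default splitter
+tokenize(text)          # tokenize with the default word tokenizer
+
+# Change the tokenizer used by `tokenize` everywhere, e.g. for benchmarking:
+set_tokenizer(toktok_tokenize)
+tokenize(text)          # now uses the Tok-tok tokenizer
+```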
+
+WordTokenizers.jl is currently used by packages such as [TextAnalysis.jl](https://github.com/JuliaText/TextAnalysis.jl), [Transformers.jl](https://github.com/chengchingwen/Transformers.jl) and [CorpusLoaders.jl](https://github.com/JuliaText/CorpusLoaders.jl) for tokenizing text.
+
+## Other similar software
+
+![Speed comparison of tokenizers on the IMDB Movie Review Dataset](speed_compare.png)
+
+Various NLP libraries and toolkits written in other programming languages are available to Julia users for tokenization.
+The [NLTK](https://github.com/nltk/nltk) and [spaCy](https://github.com/explosion/spaCy) packages provide a variety of tokenizers, accessible to Julia users via `PyCall`.
+Shown above is a performance benchmark of some of the WordTokenizers.jl tokenizers against calling the default tokenizers from NLTK and spaCy through `PyCall`.
+This was evaluated on the ~127,000 sentences of the IMDB Movie Review Dataset.
+As can be seen, WordTokenizers.jl performs very strongly on this benchmark.
+
+Many other packages, such as [Stanford CoreNLP](https://github.com/stanfordnlp/CoreNLP) and [AllenNLP](https://github.com/allenai/allennlp/), also provide a few basic tokenizers.
+However, WordTokenizers.jl is [faster](https://github.com/Ayushk4/Tweet_tok_analyse/tree/master/speed) and simpler to use, providing a wider variety of tokenizers and a means to build custom tokenizers.
+
+# References
diff --git a/paper/speed_compare.png b/paper/speed_compare.png
new file mode 100644
index 0000000..994f9dd
Binary files /dev/null and b/paper/speed_compare.png differ