From bbae44c1f1a7c669cfae699572f698d7a0b8d381 Mon Sep 17 00:00:00 2001
From: Nik Everett
Date: Wed, 9 May 2018 18:16:01 -0400
Subject: [PATCH] Docs: Document how to rebuild analyzers

Adds documentation for how to rebuild all the built-in analyzers and
tests for that documentation using the mechanism added in #29535.

Closes #29499
---
 .../analyzers/fingerprint-analyzer.asciidoc   | 48 ++++++---------
 .../analyzers/keyword-analyzer.asciidoc       | 45 +++++++++++---
 .../analyzers/pattern-analyzer.asciidoc       | 61 +++++++++++++++----
 .../analyzers/simple-analyzer.asciidoc        | 42 ++++++++++---
 .../analyzers/standard-analyzer.asciidoc      | 54 ++++++++++++----
 .../analysis/analyzers/stop-analyzer.asciidoc | 58 ++++++++++++++----
 .../analyzers/whitespace-analyzer.asciidoc    | 42 ++++++++++---
 7 files changed, 260 insertions(+), 90 deletions(-)

diff --git a/docs/reference/analysis/analyzers/fingerprint-analyzer.asciidoc b/docs/reference/analysis/analyzers/fingerprint-analyzer.asciidoc
index 7920e2ad847c8..8d5da5ea18e57 100644
--- a/docs/reference/analysis/analyzers/fingerprint-analyzer.asciidoc
+++ b/docs/reference/analysis/analyzers/fingerprint-analyzer.asciidoc
@@ -9,20 +9,6 @@
 Input text is lowercased, normalized to remove extended characters, sorted,
 deduplicated and concatenated into a single token. If a stopword list is
 configured, stop words will also be removed.

-[float]
-=== Definition
-
-It consists of:
-
-Tokenizer::
-* <>
-
-Token Filters (in order)::
-1. <>
-2. <>
-3. <> (disabled by default)
-4. <>
-
 [float]
 === Example output
@@ -150,12 +136,25 @@ The above example produces the following term:
 [ consistent godel said sentence yes ]
 ---------------------------

-=== Further customization
+[float]
+=== Definition
+
+The `fingerprint` analyzer consists of:

-You can further customize the behavior of the `fingerprint` analyzer by
-declaring a `custom` analyzer with the `fingerprint` token filter. The
-example below recreates the "standard" fingerprint analyzer and you can
-add token filters to it to change the behavior.
+Tokenizer::
+* <>
+
+Token Filters (in order)::
+* <>
+* <>
+* <> (disabled by default)
+* <>
+
+If you need to customize the `fingerprint` analyzer beyond the configuration
+parameters then you need to recreate it as a `custom` analyzer and modify
+it, usually by adding token filters. This would recreate the built-in
+`fingerprint` analyzer and you can use it as a starting point for further
+customization:

 [source,js]
 ----------------------------------------------------
@@ -163,19 +162,12 @@ PUT /fingerprint_example
 {
   "settings": {
     "analysis": {
-      "filter": {
-        "fingerprint_stop": {
-          "type": "stop",
-          "stopwords": "_english_" <1>
-        }
-      },
       "analyzer": {
         "rebuilt_fingerprint": {
-          "tokenizer": "standard",
+          "tokenizer": "standard",
           "filter": [
             "lowercase",
             "asciifolding",
-            "fingerprint_stop",
             "fingerprint"
           ]
         }
@@ -186,5 +178,3 @@ PUT /fingerprint_example
 ----------------------------------------------------
 // CONSOLE
 // TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: fingerprint_example, first: fingerprint, second: rebuilt_fingerprint}\nendyaml\n/]
-<1> The default stopwords can be overridden with the `stopwords`
-    or `stopwords_path` parameters.
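The callouts removed above point at the `stopwords` and `stopwords_path` parameters, which have no counterpart in `rebuilt_fingerprint` because the stop filter is disabled by default. A minimal sketch of adding stop word removal back into the rebuilt analyzer; the names `fingerprint_stop_example`, `english_stop` and `fingerprint_with_stop` are purely illustrative and not part of the patch:

[source,js]
----------------------------------------------------
PUT /fingerprint_stop_example
{
  "settings": {
    "analysis": {
      "filter": {
        "english_stop": {
          "type": "stop",
          "stopwords": "_english_"
        }
      },
      "analyzer": {
        "fingerprint_with_stop": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "asciifolding",
            "english_stop", <1>
            "fingerprint"
          ]
        }
      }
    }
  }
}
----------------------------------------------------
<1> The `stop` filter sits before `fingerprint` so that stop words are dropped
before the remaining tokens are sorted, deduplicated and concatenated.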
diff --git a/docs/reference/analysis/analyzers/keyword-analyzer.asciidoc b/docs/reference/analysis/analyzers/keyword-analyzer.asciidoc
index cc94f3b757e37..6699fc53e7134 100644
--- a/docs/reference/analysis/analyzers/keyword-analyzer.asciidoc
+++ b/docs/reference/analysis/analyzers/keyword-analyzer.asciidoc
@@ -4,14 +4,6 @@
 The `keyword` analyzer is a ``noop'' analyzer which returns the entire input
 string as a single token.

-[float]
-=== Definition
-
-It consists of:
-
-Tokenizer::
-* <>
-
 [float]
 === Example output

@@ -57,3 +49,40 @@ The above sentence would produce the following single term:
 === Configuration

 The `keyword` analyzer is not configurable.
+
+[float]
+=== Definition
+
+The `keyword` analyzer consists of:
+
+Tokenizer::
+* <>
+
+If you need to customize the `keyword` analyzer then you need to
+recreate it as a `custom` analyzer and modify it, usually by adding
+token filters. Usually, you should prefer the
+<> when you want strings that are not split
+into tokens, but just in case you need it, this would recreate
+the built-in `keyword` analyzer and you can use it as a starting
+point for further customization:
+
+[source,js]
+----------------------------------------------------
+PUT /keyword_example
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "rebuilt_keyword": {
+          "tokenizer": "keyword",
+          "filter": [ <1>
+          ]
+        }
+      }
+    }
+  }
+}
+----------------------------------------------------
+// CONSOLE
+// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: keyword_example, first: keyword, second: rebuilt_keyword}\nendyaml\n/]
+<1> You'd add any token filters here.
diff --git a/docs/reference/analysis/analyzers/pattern-analyzer.asciidoc b/docs/reference/analysis/analyzers/pattern-analyzer.asciidoc
index 64ab3999ef9a9..0319c9dc4852f 100644
--- a/docs/reference/analysis/analyzers/pattern-analyzer.asciidoc
+++ b/docs/reference/analysis/analyzers/pattern-analyzer.asciidoc
@@ -19,19 +19,6 @@ Read more about http://www.regular-expressions.info/catastrophic.html[pathologic

 ========================================

-
-[float]
-=== Definition
-
-It consists of:
-
-Tokenizer::
-* <>
-
-Token Filters::
-* <>
-* <> (disabled by default)
-
 [float]
 === Example output

@@ -378,3 +365,51 @@ The regex above is easier to understand as:

   [\p{L}&&[^\p{Lu}]]       # then lower case
 )
 --------------------------------------------------
+
+[float]
+=== Definition
+
+The `pattern` analyzer consists of:
+
+Tokenizer::
+* <>
+
+Token Filters::
+* <>
+* <> (disabled by default)
+
+If you need to customize the `pattern` analyzer beyond the configuration
+parameters then you need to recreate it as a `custom` analyzer and modify
+it, usually by adding token filters. This would recreate the built-in
+`pattern` analyzer and you can use it as a starting point for further
+customization:
+
+[source,js]
+----------------------------------------------------
+PUT /pattern_example
+{
+  "settings": {
+    "analysis": {
+      "tokenizer": {
+        "split_on_non_word": {
+          "type": "pattern",
+          "pattern": "\\W+" <1>
+        }
+      },
+      "analyzer": {
+        "rebuilt_pattern": {
+          "tokenizer": "split_on_non_word",
+          "filter": [
+            "lowercase" <2>
+          ]
+        }
+      }
+    }
+  }
+}
+----------------------------------------------------
+// CONSOLE
+// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: pattern_example, first: pattern, second: rebuilt_pattern}\nendyaml\n/]
+<1> The default pattern is `\W+` which splits on non-word characters
+and this is where you'd change it.
+<2> You'd add other token filters after `lowercase`.
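Callout <1> above is also the place to swap in an entirely different expression. A sketch of a rebuilt `pattern` analyzer that splits on commas rather than non-word characters, assuming the illustrative names `comma_example`, `split_on_comma` and `rebuilt_pattern_on_commas`:

[source,js]
----------------------------------------------------
PUT /comma_example
{
  "settings": {
    "analysis": {
      "tokenizer": {
        "split_on_comma": {
          "type": "pattern",
          "pattern": "," <1>
        }
      },
      "analyzer": {
        "rebuilt_pattern_on_commas": {
          "tokenizer": "split_on_comma",
          "filter": [
            "lowercase"
          ]
        }
      }
    }
  }
}
----------------------------------------------------
<1> Only the pattern changes; everything else matches `rebuilt_pattern` above.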
diff --git a/docs/reference/analysis/analyzers/simple-analyzer.asciidoc b/docs/reference/analysis/analyzers/simple-analyzer.asciidoc
index a57c30d8dd622..0cad8a28ff7b2 100644
--- a/docs/reference/analysis/analyzers/simple-analyzer.asciidoc
+++ b/docs/reference/analysis/analyzers/simple-analyzer.asciidoc
@@ -4,14 +4,6 @@
 The `simple` analyzer breaks text into terms whenever it encounters a
 character which is not a letter. All terms are lower cased.

-[float]
-=== Definition
-
-It consists of:
-
-Tokenizer::
-* <>
-
 [float]
 === Example output

@@ -127,3 +119,37 @@ The above sentence would produce the following terms:
 === Configuration

 The `simple` analyzer is not configurable.
+
+[float]
+=== Definition
+
+The `simple` analyzer consists of:
+
+Tokenizer::
+* <>
+
+If you need to customize the `simple` analyzer then you need to recreate
+it as a `custom` analyzer and modify it, usually by adding token filters.
+This would recreate the built-in `simple` analyzer and you can use it as
+a starting point for further customization:
+
+[source,js]
+----------------------------------------------------
+PUT /simple_example
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "rebuilt_simple": {
+          "tokenizer": "lowercase",
+          "filter": [ <1>
+          ]
+        }
+      }
+    }
+  }
+}
+----------------------------------------------------
+// CONSOLE
+// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: simple_example, first: simple, second: rebuilt_simple}\nendyaml\n/]
+<1> You'd add any token filters here.
diff --git a/docs/reference/analysis/analyzers/standard-analyzer.asciidoc b/docs/reference/analysis/analyzers/standard-analyzer.asciidoc
index eacbb1c3cad99..cf786acc93666 100644
--- a/docs/reference/analysis/analyzers/standard-analyzer.asciidoc
+++ b/docs/reference/analysis/analyzers/standard-analyzer.asciidoc
@@ -7,19 +7,6 @@
 Segmentation algorithm, as specified in
 http://unicode.org/reports/tr29/[Unicode Standard Annex #29]) and works well
 for most languages.

-[float]
-=== Definition
-
-It consists of:
-
-Tokenizer::
-* <>
-
-Token Filters::
-* <>
-* <>
-* <> (disabled by default)
-
 [float]
 === Example output

@@ -276,3 +263,44 @@ The above example produces the following terms:
 ---------------------------
 [ 2, quick, brown, foxes, jumpe, d, over, lazy, dog's, bone ]
 ---------------------------
+
+[float]
+=== Definition
+
+The `standard` analyzer consists of:
+
+Tokenizer::
+* <>
+
+Token Filters::
+* <>
+* <>
+* <> (disabled by default)
+
+If you need to customize the `standard` analyzer beyond the configuration
+parameters then you need to recreate it as a `custom` analyzer and modify
+it, usually by adding token filters. This would recreate the built-in
+`standard` analyzer and you can use it as a starting point:
+
+[source,js]
+----------------------------------------------------
+PUT /standard_example
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "rebuilt_standard": {
+          "tokenizer": "standard",
+          "filter": [
+            "standard",
+            "lowercase" <1>
+          ]
+        }
+      }
+    }
+  }
+}
+----------------------------------------------------
+// CONSOLE
+// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: standard_example, first: standard, second: rebuilt_standard}\nendyaml\n/]
+<1> You'd add any token filters after `lowercase`.
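The built-in `standard` analyzer also exposes a `max_token_length` parameter, the setting behind the `jumpe, d` output in the context above. It lives on the tokenizer, so carrying it over to a rebuilt analyzer means defining a custom `standard` tokenizer first. A sketch, with the illustrative names `standard_max_token_example`, `standard_max_5` and `rebuilt_standard_max_5`:

[source,js]
----------------------------------------------------
PUT /standard_max_token_example
{
  "settings": {
    "analysis": {
      "tokenizer": {
        "standard_max_5": {
          "type": "standard",
          "max_token_length": 5 <1>
        }
      },
      "analyzer": {
        "rebuilt_standard_max_5": {
          "tokenizer": "standard_max_5",
          "filter": [
            "standard",
            "lowercase"
          ]
        }
      }
    }
  }
}
----------------------------------------------------
<1> Tokens longer than five characters are split at five-character intervals,
which is what turns `jumped` into `jumpe` and `d`.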
diff --git a/docs/reference/analysis/analyzers/stop-analyzer.asciidoc b/docs/reference/analysis/analyzers/stop-analyzer.asciidoc
index eacc7e106e799..fcb4b916a64c5 100644
--- a/docs/reference/analysis/analyzers/stop-analyzer.asciidoc
+++ b/docs/reference/analysis/analyzers/stop-analyzer.asciidoc
@@ -5,17 +5,6 @@ The `stop` analyzer is the same as the <>

-[float]
-=== Definition
-
-It consists of:
-
-Tokenizer::
-* <>
-
-Token filters::
-* <>
-
 [float]
 === Example output

@@ -239,3 +228,50 @@ The above example produces the following terms:
 ---------------------------
 [ quick, brown, foxes, jumped, lazy, dog, s, bone ]
 ---------------------------
+
+[float]
+=== Definition
+
+The `stop` analyzer consists of:
+
+Tokenizer::
+* <>
+
+Token filters::
+* <>
+
+If you need to customize the `stop` analyzer beyond the configuration
+parameters then you need to recreate it as a `custom` analyzer and modify
+it, usually by adding token filters. This would recreate the built-in
+`stop` analyzer and you can use it as a starting point for further
+customization:
+
+[source,js]
+----------------------------------------------------
+PUT /stop_example
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "english_stop": {
+          "type": "stop",
+          "stopwords": "_english_" <1>
+        }
+      },
+      "analyzer": {
+        "rebuilt_stop": {
+          "tokenizer": "lowercase",
+          "filter": [
+            "english_stop" <2>
+          ]
+        }
+      }
+    }
+  }
+}
+----------------------------------------------------
+// CONSOLE
+// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: stop_example, first: stop, second: rebuilt_stop}\nendyaml\n/]
+<1> The default stopwords can be overridden with the `stopwords`
+    or `stopwords_path` parameters.
+<2> You'd add any token filters after `english_stop`.
diff --git a/docs/reference/analysis/analyzers/whitespace-analyzer.asciidoc b/docs/reference/analysis/analyzers/whitespace-analyzer.asciidoc
index f95e5c6e4ab65..035e22692db64 100644
--- a/docs/reference/analysis/analyzers/whitespace-analyzer.asciidoc
+++ b/docs/reference/analysis/analyzers/whitespace-analyzer.asciidoc
@@ -4,14 +4,6 @@
 The `whitespace` analyzer breaks text into terms whenever it encounters a
 whitespace character.

-[float]
-=== Definition
-
-It consists of:
-
-Tokenizer::
-* <>
-
 [float]
 === Example output

@@ -120,3 +112,37 @@ The above sentence would produce the following terms:
 === Configuration

 The `whitespace` analyzer is not configurable.
+
+[float]
+=== Definition
+
+The `whitespace` analyzer consists of:
+
+Tokenizer::
+* <>
+
+If you need to customize the `whitespace` analyzer then you need to
+recreate it as a `custom` analyzer and modify it, usually by adding
+token filters. This would recreate the built-in `whitespace` analyzer
+and you can use it as a starting point for further customization:
+
+[source,js]
+----------------------------------------------------
+PUT /whitespace_example
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "rebuilt_whitespace": {
+          "tokenizer": "whitespace",
+          "filter": [ <1>
+          ]
+        }
+      }
+    }
+  }
+}
+----------------------------------------------------
+// CONSOLE
+// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: whitespace_example, first: whitespace, second: rebuilt_whitespace}\nendyaml\n/]
+<1> You'd add any token filters here.
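The `compare_analyzers` hook in the `// TEST` directives above is what checks each rebuilt analyzer against its built-in counterpart in the docs tests. Outside the test harness, a quick manual spot check can run the `_analyze` API against both; the sample text below is only illustrative:

[source,js]
----------------------------------------------------
POST /whitespace_example/_analyze
{
  "analyzer": "rebuilt_whitespace",
  "text": "The QUICK brown fox."
}

POST /whitespace_example/_analyze
{
  "analyzer": "whitespace", <1>
  "text": "The QUICK brown fox."
}
----------------------------------------------------
<1> Both requests should return the same four tokens, `The`, `QUICK`, `brown`
and `fox.`, since the `whitespace` analyzer neither lowercases nor strips
punctuation.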