-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add
phone
& phone-search
analyzer + tokenizer
this is largely based on [elasticsearch-phone] and internally uses [libphonenumber]. this intentionally only ports a subset of the features: only `phone` and `phone-search` are supported right now, `phone-email` can be added if/when there's a clear need for it. this allows defining the region to be used when analysing a phone number. so far only the generic "unknown" region (`ZZ`) had been used which worked as long as international numbers were prefixed with `+` but did not work when using local numbers (e.g. a number stored as `+4158...` was not matched against a number entered as `004158...` or `058...`). example configuration for an index: ```json { "index": { "analysis": { "analyzer": { "phone": { "type": "phone" }, "phone-search": { "type": "phone-search" }, "phone-ch": { "type": "phone", "phone-region": "CH" }, "phone-search-ch": { "type": "phone-search", "phone-region": "CH" } } } } } ``` this creates four analyzers: `phone` and `phone-search` which do not explicitly specify a region and thus fall back to `ZZ` (unknown region, regional version of international dialing prefix (e.g. `00` instead of `+` in most of europe) will not be recognised) and `phone-ch` and `phone-search-ch` which will try to parse the phone number as a swiss phone number (thus e.g. `00` as a prefix is recognised as the international dialing prefix). closes #11326 [elasticsearch-phone]: https://github.com/purecloudlabs/elasticsearch-phone [libphonenumber]: https://github.com/google/libphonenumber Signed-off-by: Ralph Ursprung <[email protected]>
- Loading branch information
Showing
8 changed files
with
529 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
52 changes: 52 additions & 0 deletions
52
...les/analysis-common/src/main/java/org/opensearch/analysis/common/PhoneNumberAnalyzer.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
/* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
* | ||
* The OpenSearch Contributors require contributions made to | ||
* this file be licensed under the Apache-2.0 license or a | ||
* compatible open source license. | ||
*/ | ||
|
||
package org.opensearch.analysis.common; | ||
|
||
import org.apache.lucene.analysis.Analyzer; | ||
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter; | ||
import org.opensearch.common.settings.Settings; | ||
|
||
/** | ||
* Analyzer for phone numbers, using {@link PhoneNumberTermTokenizer}. | ||
* | ||
* <p> | ||
* You can use the {@code phone} and {@code phone-search} analyzers on your fields to index phone numbers. | ||
* Use {@code phone} (which creates ngrams) for the {@code analyzer} and {@code phone-search} (which doesn't create ngrams) | ||
* for the {@code search_analyzer}. | ||
* </p> | ||
* | ||
* <p> | ||
* You optionally can specify a region with the {@code phone-region} setting for the phone number which will ensure that | ||
* phone numbers without the international dialling prefix (using {@code +}) are also tokenized correctly. | ||
* </p> | ||
* | ||
* <p> | ||
* Note that the tokens will not refer to a specific position in the stream as the tokenizer is expected to be used on strings | ||
* containing phone numbers and not arbitrary text with interspersed phone numbers. | ||
* </p> | ||
*/ | ||
public class PhoneNumberAnalyzer extends Analyzer { | ||
private final boolean addNgrams; | ||
private final Settings settings; | ||
|
||
/** | ||
* @param addNgrams defines whether ngrams for the phone number should be added. Set to true for indexing and false for search. | ||
* @param settings the settings for the analyzer. | ||
*/ | ||
public PhoneNumberAnalyzer(final Settings settings, final boolean addNgrams) { | ||
this.addNgrams = addNgrams; | ||
this.settings = settings; | ||
} | ||
|
||
@Override | ||
protected TokenStreamComponents createComponents(String fieldName) { | ||
final var tokenizer = new PhoneNumberTermTokenizer(this.settings, this.addNgrams); | ||
return new Analyzer.TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(tokenizer)); | ||
} | ||
} |
31 changes: 31 additions & 0 deletions
31
...ysis-common/src/main/java/org/opensearch/analysis/common/PhoneNumberAnalyzerProvider.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
/* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
* | ||
* The OpenSearch Contributors require contributions made to | ||
* this file be licensed under the Apache-2.0 license or a | ||
* compatible open source license. | ||
*/ | ||
|
||
package org.opensearch.analysis.common; | ||
|
||
import org.opensearch.common.settings.Settings; | ||
import org.opensearch.index.IndexSettings; | ||
import org.opensearch.index.analysis.AbstractIndexAnalyzerProvider; | ||
|
||
/** | ||
* Provider for {@link PhoneNumberAnalyzer}. | ||
*/ | ||
public class PhoneNumberAnalyzerProvider extends AbstractIndexAnalyzerProvider<PhoneNumberAnalyzer> { | ||
|
||
private final PhoneNumberAnalyzer analyzer; | ||
|
||
public PhoneNumberAnalyzerProvider(final IndexSettings indexSettings, final String name, final Settings settings, final boolean addNgrams) { | ||
super(indexSettings, name, settings); | ||
this.analyzer = new PhoneNumberAnalyzer(settings, addNgrams); | ||
} | ||
|
||
@Override | ||
public PhoneNumberAnalyzer get() { | ||
return this.analyzer; | ||
} | ||
} |
147 changes: 147 additions & 0 deletions
147
...nalysis-common/src/main/java/org/opensearch/analysis/common/PhoneNumberTermTokenizer.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
/* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
* | ||
* The OpenSearch Contributors require contributions made to | ||
* this file be licensed under the Apache-2.0 license or a | ||
* compatible open source license. | ||
*/ | ||
|
||
package org.opensearch.analysis.common; | ||
|
||
import com.google.i18n.phonenumbers.NumberParseException; | ||
import com.google.i18n.phonenumbers.PhoneNumberUtil; | ||
import org.apache.lucene.analysis.Tokenizer; | ||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | ||
import org.opensearch.common.io.Streams; | ||
import org.opensearch.common.settings.Settings; | ||
import org.opensearch.core.common.Strings; | ||
|
||
import java.io.IOException; | ||
import java.util.HashSet; | ||
import java.util.Iterator; | ||
import java.util.Optional; | ||
import java.util.Set; | ||
|
||
/** | ||
* This tokenizes a phone number into its individual parts, using {@link PhoneNumberUtil}. | ||
* | ||
* <p> | ||
* You can use the {@code phone} and {@code phone-search} analyzers on your fields to index phone numbers. | ||
* Use {@code phone} (which creates ngrams) for the {@code analyzer} and {@code phone-search} (which doesn't create ngrams) | ||
* for the {@code search_analyzer}. | ||
* </p> | ||
* | ||
* <p> | ||
* You optionally can specify a region with the {@code phone-region} setting for the phone number which will ensure that | ||
* phone numbers without the international dialling prefix (using {@code +}) are also tokenized correctly. | ||
* </p> | ||
* | ||
* <p> | ||
* Note that the tokens will not refer to a specific position in the stream as the tokenizer is expected to be used on strings | ||
* containing phone numbers and not arbitrary text with interspersed phone numbers. | ||
* </p> | ||
*/ | ||
public final class PhoneNumberTermTokenizer extends Tokenizer { | ||
private final boolean addNgrams; | ||
private final Settings settings; | ||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); | ||
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); | ||
private Iterator<String> tokenIterator; | ||
|
||
/** | ||
* @param addNgrams defines whether ngrams for the phone number should be added. Set to true for indexing and false for search. | ||
* @param settings the settings for the analyzer. | ||
*/ | ||
public PhoneNumberTermTokenizer(final Settings settings, final boolean addNgrams) { | ||
super(); | ||
this.addNgrams = addNgrams; | ||
this.settings = settings; | ||
} | ||
|
||
@Override | ||
public void reset() throws IOException { | ||
super.reset(); | ||
tokenIterator = null; | ||
} | ||
|
||
@Override | ||
public boolean incrementToken() throws IOException { | ||
clearAttributes(); | ||
if (tokenIterator == null) { | ||
tokenIterator = getTokens().iterator(); | ||
} | ||
if (tokenIterator.hasNext()) { | ||
termAtt.append(tokenIterator.next()); | ||
posIncrAtt.setPositionIncrement(0); | ||
return true; | ||
} | ||
return false; | ||
} | ||
|
||
private Set<String> getTokens() throws IOException { | ||
final var tokens = new HashSet<String>(); | ||
|
||
var input = Streams.copyToString(this.input); | ||
|
||
tokens.add(input); | ||
|
||
// Rip off the "tel:" or "sip:" prefix | ||
if (input.indexOf("tel:") == 0 || input.indexOf("sip:") == 0) { | ||
tokens.add(input.substring(0, 4)); | ||
input = input.substring(4); | ||
} | ||
|
||
final var startIndex = input.startsWith("+") ? 1 : 0; | ||
// Add the complete input but skip a leading + | ||
tokens.add(input.substring(startIndex)); | ||
|
||
// Drop anything after @. Most likely there's nothing of interest | ||
final var posAt = input.indexOf('@'); | ||
if (posAt != -1) { | ||
input = input.substring(0, posAt); | ||
|
||
// Add a token for the raw unmanipulated address. Note this could be a username (sip) instead of telephone | ||
// number so take it as is | ||
tokens.add(input.substring(startIndex)); | ||
} | ||
|
||
// Let google's libphone try to parse it | ||
final var phoneUtil = PhoneNumberUtil.getInstance(); | ||
Optional<String> countryCode = Optional.empty(); | ||
try { | ||
// ZZ is the generic "I don't know the country code" region. Google's libphone library will try to infer it. | ||
final var region = this.settings.get("phone-region", "ZZ"); | ||
final var numberProto = phoneUtil.parse(input, region); | ||
if (numberProto != null) { | ||
// Libphone likes it! | ||
countryCode = Optional.of(String.valueOf(numberProto.getCountryCode())); | ||
input = String.valueOf(numberProto.getNationalNumber()); | ||
|
||
// Add Country code, extension, and the number as tokens | ||
tokens.add(countryCode.get()); | ||
tokens.add(countryCode.get() + input); | ||
if (!Strings.isEmpty(numberProto.getExtension())) { | ||
tokens.add(numberProto.getExtension()); | ||
} | ||
|
||
tokens.add(input); | ||
} | ||
} catch (final NumberParseException | StringIndexOutOfBoundsException e) { | ||
// Libphone didn't like it, no biggie. We'll just ngram the number as it is. | ||
} | ||
|
||
// ngram the phone number, e.g. 19198243333 produces 9, 91, 919, etc | ||
if (this.addNgrams && Strings.isDigits(input)) { | ||
for (int count = 1; count <= input.length(); ++count) { | ||
final var token = input.substring(0, count); | ||
tokens.add(token); | ||
// If there was a country code, add more ngrams such that 19198243333 produces 19, 191, 1919, etc | ||
countryCode.ifPresent(s -> tokens.add(s + token)); | ||
} | ||
} | ||
|
||
return tokens; | ||
} | ||
|
||
} |
33 changes: 33 additions & 0 deletions
33
...-common/src/main/java/org/opensearch/analysis/common/PhoneNumberTermTokenizerFactory.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
/* | ||
* SPDX-License-Identifier: Apache-2.0 | ||
* | ||
* The OpenSearch Contributors require contributions made to | ||
* this file be licensed under the Apache-2.0 license or a | ||
* compatible open source license. | ||
*/ | ||
|
||
package org.opensearch.analysis.common; | ||
|
||
import org.apache.lucene.analysis.Tokenizer; | ||
import org.opensearch.common.settings.Settings; | ||
import org.opensearch.index.IndexSettings; | ||
import org.opensearch.index.analysis.AbstractTokenizerFactory; | ||
|
||
/** | ||
* Factory for {@link PhoneNumberTermTokenizer}. | ||
*/ | ||
public class PhoneNumberTermTokenizerFactory extends AbstractTokenizerFactory { | ||
private final Settings settings; | ||
private final boolean addNgrams; | ||
|
||
public PhoneNumberTermTokenizerFactory(final IndexSettings indexSettings, final String name, final Settings settings, final boolean addNgrams) { | ||
super(indexSettings, settings, name); | ||
this.settings = settings; | ||
this.addNgrams = addNgrams; | ||
} | ||
|
||
@Override | ||
public Tokenizer create() { | ||
return new PhoneNumberTermTokenizer(this.settings, this.addNgrams); | ||
} | ||
} |
Oops, something went wrong.