Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Upgrade ES to 7.8 #133

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
19 changes: 9 additions & 10 deletions image_match/elasticsearch_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,24 +53,23 @@ def search_single_record(self, rec, pre_filter=None):
rec.pop('metadata')

# build the 'should' list
should = [{'term': {word: rec[word]}} for word in rec]
should = [{'term': {'{}.{}'.format(self.doc_type, word): rec[word]}} for word in rec]
body = {
'query': {
'bool': {'should': should}
},
'_source': {'excludes': ['simple_word_*']}
'_source': {'excludes': ['{}.simple_word_*'.format(self.doc_type)]}
}

if pre_filter is not None:
body['query']['bool']['filter'] = pre_filter

res = self.es.search(index=self.index,
doc_type=self.doc_type,
body=body,
size=self.size,
timeout=self.timeout)['hits']['hits']

sigs = np.array([x['_source']['signature'] for x in res])
sigs = np.array([x['_source'][self.doc_type]['signature'] for x in res])

if sigs.size == 0:
return []
Expand All @@ -79,8 +78,8 @@ def search_single_record(self, rec, pre_filter=None):

formatted_res = [{'id': x['_id'],
'score': x['_score'],
'metadata': x['_source'].get('metadata'),
'path': x['_source'].get('url', x['_source'].get('path'))}
'metadata': x['_source'][self.doc_type].get('metadata'),
'path': x['_source'][self.doc_type].get('url', x['_source'][self.doc_type].get('path'))}
for x in res]

for i, row in enumerate(formatted_res):
Expand All @@ -91,7 +90,7 @@ def search_single_record(self, rec, pre_filter=None):

def insert_single_record(self, rec, refresh_after=False):
rec['timestamp'] = datetime.now()
self.es.index(index=self.index, doc_type=self.doc_type, body=rec, refresh=refresh_after)
self.es.index(index=self.index, body={ self.doc_type: rec }, refresh=refresh_after)

def delete_duplicates(self, path):
"""Delete all but one entries in elasticsearch whose `path` value is equivalent to that of path.
Expand All @@ -101,11 +100,11 @@ def delete_duplicates(self, path):
matching_paths = [item['_id'] for item in
self.es.search(body={'query':
{'match':
{'path': path}
{'{}.path'.format(self.doc_type): path}
}
},
index=self.index)['hits']['hits']
if item['_source']['path'] == path]
if item['_source'][self.doc_type]['path'] == path]
if len(matching_paths) > 0:
for id_tag in matching_paths[1:]:
self.es.delete(index=self.index, doc_type=self.doc_type, id=id_tag)
self.es.delete(index=self.index, id=id_tag)
4 changes: 2 additions & 2 deletions image_match/goldberg.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,14 +236,14 @@ def preprocess_image(image_or_path, bytestream=False, handle_mpo=False):
return rgb2gray(np.asarray(img, dtype=np.uint8))
elif type(image_or_path) in string_types or \
type(image_or_path) is text_type:
return imread(image_or_path, as_grey=True)
return imread(image_or_path, as_gray=True)
elif type(image_or_path) is bytes:
try:
img = Image.open(image_or_path)
arr = np.array(img.convert('RGB'))
except IOError:
# try again due to PIL weirdness
return imread(image_or_path, as_grey=True)
return imread(image_or_path, as_gray=True)
if handle_mpo:
# take the first images from the MPO
if arr.shape == (2,) and isinstance(arr[1].tolist(), MpoImageFile):
Expand Down
2 changes: 1 addition & 1 deletion image_match/signature_database_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def search_single_record(self, rec, pre_filter=None):
before applying the matching strategy

For example:
{ "term": { "metadata.category": "art" } }
{ "term": { "image.metadata.category": "art" } }

Returns:
a formatted list of dicts representing matches.
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def find_version(*file_paths):
],
install_requires=[
'scikit-image>=0.14',
'elasticsearch>=5.0.0,<6.0.0',
'elasticsearch>=7.0.0,<8.0.0',
'six>=1.11.0',
],
tests_require=tests_require,
Expand Down
34 changes: 19 additions & 15 deletions tests/test_elasticsearch_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,24 @@
INDEX_NAME = 'test_environment_{}'.format(hashlib.md5(os.urandom(128)).hexdigest()[:12])
DOC_TYPE = 'image'
MAPPINGS = {
"mappings": {
DOC_TYPE: {
"dynamic": True,
"properties": {
"metadata": {
"type": "object",
"dynamic": True,
"properties": {
"tenant_id": { "type": "keyword" }
"mappings": {
"properties": {
DOC_TYPE: {
"properties": {
"path": {
"type": "keyword"
},
"metadata": {
"properties": {
"tenant_id": {
"type": "keyword",
}
}
}
}
}
}
}
}
}
}


Expand All @@ -46,7 +50,7 @@ def setup_index(request, index_name):
try:
es.indices.create(index=index_name, body=MAPPINGS)
except RequestError as e:
if e.error == u'index_already_exists_exception':
if e.error == u'resource_already_exists_exception':
es.indices.delete(index_name)
else:
raise
Expand Down Expand Up @@ -189,15 +193,15 @@ def test_lookup_with_filter_by_metadata(ses):
)
ses.add_image('test2.jpg', metadata=metadata2, refresh_after=True)

r = ses.search_image('test1.jpg', pre_filter={"term": {"metadata.tenant_id": "foo"}})
r = ses.search_image('test1.jpg', pre_filter={"term": {'{}.metadata.tenant_id'.format(DOC_TYPE): "foo"}})
assert len(r) == 1
assert r[0]['metadata'] == metadata

r = ses.search_image('test1.jpg', pre_filter={"term": {"metadata.tenant_id": "bar-2"}})
r = ses.search_image('test1.jpg', pre_filter={"term": {'{}.metadata.tenant_id'.format(DOC_TYPE): "bar-2"}})
assert len(r) == 1
assert r[0]['metadata'] == metadata2

r = ses.search_image('test1.jpg', pre_filter={"term": {"metadata.tenant_id": "bar-3"}})
r = ses.search_image('test1.jpg', pre_filter={"term": {'{}.metadata.tenant_id'.format(DOC_TYPE): "bar-3"}})
assert len(r) == 0


Expand Down
43 changes: 18 additions & 25 deletions tests/test_elasticsearch_driver_metadata_as_nested.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,23 @@
INDEX_NAME = 'test_environment_{}'.format(hashlib.md5(os.urandom(128)).hexdigest()[:12])
DOC_TYPE = 'image'
MAPPINGS = {
"mappings": {
DOC_TYPE: {
"dynamic": True,
"properties": {
"metadata": {
"type": "nested",
"dynamic": True,
"properties": {
"tenant_id": { "type": "keyword" },
"project_id": { "type": "keyword" }
"mappings": {
"properties": {
DOC_TYPE: {
"properties": {
"path": {
"type": "keyword"
},
"metadata": {
"properties": {
"tenant_id": { "type": "keyword" },
"project_id": { "type": "keyword" }
}
}
}
}
}
}
}
}
}


Expand Down Expand Up @@ -122,16 +124,7 @@ def _metadata(tenant_id, project_id):
)

def _nested_filter(tenant_id, project_id):
return {
"nested" : {
"path" : "metadata",
"query" : {
"bool" : {
"must" : [
{"term": {"metadata.tenant_id": tenant_id}},
{"term": {"metadata.project_id": project_id}}
]
}
}
}
}
return [
{"term": {"image.metadata.tenant_id": tenant_id}},
{"term": {"image.metadata.project_id": project_id}}
]