Some adjustments to simplify ingestion of data #158

Open · wants to merge 13 commits into base: master
2 changes: 1 addition & 1 deletion geospaas/base_viewer/apps.py
@@ -2,4 +2,4 @@


class BaseViewerConfig(AppConfig):
name = 'base_viewer'
name = 'geospaas.base_viewer'
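Changing `name` to the dotted path `geospaas.base_viewer` matches how the app is imported when `geospaas` is installed as a package, so Django can resolve the AppConfig. A minimal illustration of the settings entry this assumes (the settings snippet is hypothetical, not part of this PR):

```python
# settings.py (illustrative)
INSTALLED_APPS = [
    # ...
    'geospaas.base_viewer',  # must match BaseViewerConfig.name
]
```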
@@ -38,7 +38,7 @@ def crawl_and_ingest(url, **options):
# Create Dataset from OPeNDAP url - this is necessary to get all metadata
gds, cr = NansatDataset.objects.get_or_create(url, uri_service_name=name,
uri_service_type=service)
except (IOError, AttributeError) as e:
except (IOError, AttributeError, ValueError) as e:
# warnings.warn(e.message)
continue
if cr:
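Adding `ValueError` to the caught exceptions lets the crawler skip datasets whose metadata cannot be parsed instead of aborting the whole crawl. A minimal sketch of that skip-and-continue pattern (the `urls` list and `ingest` callable are hypothetical stand-ins for the command's actual crawl loop):

```python
def crawl(urls, ingest):
    """Ingest what we can; skip datasets that fail to open or parse."""
    ingested = 0
    for url in urls:
        try:
            ingest(url)  # may raise on unreachable files or malformed metadata
        except (IOError, AttributeError, ValueError):
            continue  # skip this dataset and keep crawling
        ingested += 1
    return ingested
```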
40 changes: 30 additions & 10 deletions geospaas/nansat_ingestor/managers.py
@@ -1,9 +1,10 @@
import json
import logging
import uuid
import warnings

import pythesint as pti
from django.contrib.gis.geos import WKTReader
from django.db import connection
from django.db import models

from geospaas.catalog.managers import FILE_SERVICE_NAME, LOCAL_FILE_SERVICE
@@ -56,23 +57,37 @@ def get_or_create(self,
uris = DatasetURI.objects.filter(uri=uri, **uri_filter_args)
if len(uris) > 0:
return uris[0].dataset, False
connection.close()

# Open file with Nansat
n = Nansat(nansat_filename(uri), **kwargs)

# get metadata from Nansat and get objects from vocabularies
n_metadata = n.get_metadata()

entry_id = n_metadata.get('entry_id', None)
entry_id = n_metadata.get('entry_id', n_metadata.get('id', None))
# set compulsory metadata (source)
platform, _ = Platform.objects.get_or_create(
json.loads(n_metadata['platform']))
instrument, _ = Instrument.objects.get_or_create(
json.loads(n_metadata['instrument']))
pp = n_metadata['platform']
try:
pp_dict = json.loads(pp)
except json.JSONDecodeError:
pp_entry = [elem.strip() for elem in pp.split('>')]
pp_dict = pti.get_gcmd_platform(pp_entry[-1].split('(')[1][0:-1])
platform, _ = Platform.objects.get_or_create(pp_dict)
connection.close()
ii = n_metadata['instrument']
try:
ii_dict = json.loads(ii)
except json.JSONDecodeError:
ii_entry = [elem.strip() for elem in ii.split('>')]
ii_dict = pti.get_gcmd_instrument(ii_entry[-1].split('(')[1][0:-1])
instrument, _ = Instrument.objects.get_or_create(ii_dict)
connection.close()
specs = n_metadata.get('specs', '')
source, _ = Source.objects.get_or_create(platform=platform,
instrument=instrument,
specs=specs)
connection.close()

default_char_fields = {
# Adding NERSC_ in front of the id violates the string representation of the uuid
@@ -88,9 +103,10 @@ def get_or_create(self,
existing_ds = Dataset.objects.get(entry_id=entry_id)
except Dataset.DoesNotExist:
existing_ds = None
connection.close()
for name in default_char_fields:
if name not in n_metadata:
warnings.warn('%s is not provided in Nansat metadata!' % name)
logging.debug('%s is not provided in Nansat metadata!' % name)
# prevent overwriting of existing values by defaults
if existing_ds:
options[name] = existing_ds.__getattribute__(name)
@@ -103,7 +119,7 @@ def get_or_create(self,
'gcmd_location': {'model': Location,
'value': pti.get_gcmd_location('SEA SURFACE')},
'data_center': {'model': DataCenter,
'value': pti.get_gcmd_provider('NERSC')},
'value': pti.get_gcmd_provider('NO/MET')},
'ISO_topic_category': {'model': ISOTopicCategory,
'value': pti.get_iso19115_topic_category('Oceans')},
}
@@ -113,12 +129,12 @@ def get_or_create(self,
value = default_foreign_keys[name]['value']
model = default_foreign_keys[name]['model']
if name not in n_metadata:
warnings.warn('%s is not provided in Nansat metadata!' % name)
logging.debug('%s is not provided in Nansat metadata!' % name)
else:
try:
value = json.loads(n_metadata[name])
except:
warnings.warn('%s value of %s metadata provided in Nansat is wrong!' %
logging.debug('%s value of %s metadata provided in Nansat is wrong!' %
(n_metadata[name], name))
if existing_ds:
options[name] = existing_ds.__getattribute__(name)
@@ -130,6 +146,7 @@ def get_or_create(self,
n.reproject_gcps()
geolocation = GeographicLocation.objects.get_or_create(
geometry=WKTReader().read(n.get_border_wkt(nPoints=n_points)))[0]
connection.close()

# create dataset
# - the get_or_create method should use get_or_create here as well,
@@ -145,6 +162,7 @@ def get_or_create(self,
'entry_title': options["entry_title"],
'summary': options["summary"]}
)
connection.close()

# create parameter
all_band_meta = n.bands()
@@ -162,12 +180,14 @@ def get_or_create(self,
params = params.filter(units=units)
if params.count() >= 1:
ds.parameters.add(params[0])
connection.close()

# create dataset URI
DatasetURI.objects.get_or_create(
name=uri_service_name,
service=uri_service_type,
uri=uri,
dataset=ds)
connection.close()

return ds, created
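Two patterns recur in this file. First, `connection.close()` is called after each block of database work, presumably so that long-running or multi-process ingestion never reuses a stale or forked connection; Django transparently opens a fresh connection on the next query, so the cost is only a reconnect. Second, when the `platform`/`instrument` metadata is not JSON, the new code treats it as a GCMD keyword path such as `Basis > Category > SHORT_NAME (Long name)` and looks up the name inside the parentheses via pythesint. A standalone sketch of that fallback (the sample string in the comment is hypothetical):

```python
import json

import pythesint as pti


def gcmd_platform_dict(raw):
    """Parse JSON metadata, falling back to a GCMD '>'-separated keyword string."""
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        leaf = raw.split('>')[-1].strip()  # e.g. 'SENTINEL-1A (Sentinel-1A)'
        name = leaf.split('(')[1][:-1]     # text inside the parentheses
        return pti.get_gcmd_platform(name)
```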
12 changes: 9 additions & 3 deletions geospaas/utils/utils.py
@@ -25,9 +25,15 @@ def module_path(module, root):
return media_path


def path(module, filename, root):
def path(module, filename, root, date=None):
mp = module_path(module, root)

if date is not None:
for xx in [date.strftime('%Y'), date.strftime('%m'), date.strftime('%d')]:
mp = os.path.join(mp, xx)
if not os.path.exists(mp):
os.mkdir(mp)

# Get the path of media files created from <filename>
basename = os.path.split(filename)[-1].split('.')[0]
dataset_path = os.path.join(mp, basename)
@@ -39,8 +45,8 @@ def path(module, filename, root):
def media_path(module, filename):
return path(module, filename, settings.MEDIA_ROOT)

def product_path(module, filename):
return path(module, filename, settings.PRODUCTS_ROOT)
def product_path(module, filename, date=None):
return path(module, filename, settings.PRODUCTS_ROOT, date=date)

def validate_uri(uri):
""" Validation of URI and its existence
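With the new optional `date` argument, `path()` nests output under `YYYY/MM/DD` subdirectories, creating each level on demand. A usage sketch (module name and filename are illustrative only):

```python
import datetime

# Files for the same product land in per-day directories, e.g.
# <PRODUCTS_ROOT>/<module path>/2011/05/21/<basename>/
p = product_path('sar_doppler', '/data/ASA_WSM_1P_subset.N1',
                 date=datetime.datetime(2011, 5, 21))
```

Note that `os.mkdir` raises if two workers create the same directory at once; `os.makedirs(mp, exist_ok=True)` would be a race-free alternative here.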
@@ -35,15 +35,16 @@ def handle(self, *args, **options):
models = [
Parameter,
DataCenter,
HorizontalDataResolution,
#HorizontalDataResolution,
Instrument,
ISOTopicCategory,
Location,
Platform,
Project,
#Project,
ScienceKeyword,
TemporalDataResolution,
VerticalDataResolution]
#TemporalDataResolution,
#VerticalDataResolution,
]

for model in models:
model.objects.create_from_vocabularies(**options)
28 changes: 22 additions & 6 deletions geospaas/vocabularies/managers.py
@@ -52,14 +52,20 @@ def create_instances(self, pti_list):
"""
num = 0
for entry in pti_list:
pp, created = self.get_or_create(entry)
if bool(entry):
pp, created = self.get_or_create(entry)
else:
created = False
if created: num+=1
print("Successfully added %d new entries" % num)

def get_or_create(self, entry, *args, **kwargs):
""" Get or create database instance from input pythesint entry """

params = {key : entry[self.mapping[key]] for key in self.mapping}
try:
    params = {key : entry[self.mapping[key]] for key in self.mapping}
except KeyError as err:
    raise KeyError(
        'Vocabulary entry %s lacks expected key %s' % (entry, err))
return super(VocabularyManager, self).get_or_create(**params)

def create_from_vocabularies(self, force=False, **kwargs):
@@ -114,11 +120,21 @@ def get_by_natural_key(self, standard_name):
class PlatformManager(VocabularyManager):
get_list = pti.get_gcmd_platform_list
update = pti.update_gcmd_platform
mapping = dict(category='Category',
series_entity='Series_Entity',
# mapping = dict(category='Category',
# series_entity='Series_Entity',
# short_name='Short_Name',
# long_name='Long_Name')
# New mapping to adapt to changes in GCMD
mapping = dict(category='Basis',
series_entity='Category',
short_name='Short_Name',
long_name='Long_Name')

# The fully correct mapping would require updating the tables:
# mapping = dict(basis='Basis',
# category='Category',
# sub_category='Sub_Category',
# short_name='Short_Name',
# long_name='Long_Name')


class InstrumentManager(VocabularyManager):
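The `mapping` dict translates keys of a pythesint/GCMD entry into model field names; `get_or_create` applies it with a dict comprehension before delegating to Django's manager. A minimal sketch of that translation under the new mapping (the sample entry is hypothetical):

```python
mapping = dict(category='Basis',
               series_entity='Category',
               short_name='Short_Name',
               long_name='Long_Name')

entry = {  # hypothetical GCMD platform entry after the vocabulary update
    'Basis': 'Space-based Platforms',
    'Category': 'Earth Observation Satellites',
    'Short_Name': 'SENTINEL-1A',
    'Long_Name': 'Sentinel-1A',
}

params = {field: entry[key] for field, key in mapping.items()}
# params == {'category': 'Space-based Platforms',
#            'series_entity': 'Earth Observation Satellites',
#            'short_name': 'SENTINEL-1A',
#            'long_name': 'Sentinel-1A'}
```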