Skip to content

Commit

Permalink
Merge pull request #119 from NASA-PDS/i107-workaround-excision
Browse files Browse the repository at this point in the history
Resolve #107
  • Loading branch information
nutjob4life authored Oct 15, 2021
2 parents 870d808 + 51ed266 commit db42ba2
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 64 deletions.
9 changes: 1 addition & 8 deletions docs/source/usage/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -181,14 +181,7 @@ include:
tab-separated values file.
• ``insight_documents_v2.0_sip_v1.0_DATE.xml``, a PDS label for the SIP file.

As of this writing, there may be an issue with PDS Registry APIs
that may affect ``pds-deep-registry-archive``:

• A `pagination bug`_ may cause some performance issues when making deep
archives of large bundles. By default, ``pds-deep-registry-archive``
works around the bug—but if you know the PDS Registry you're using is
free of the bug, you can add ``--disable-pagination-workaround`` to the
command. It doesn't hurt if you use it regardless.
As with ``pds-deep-archive``, you can also specify ``--include-latest-collection-only`` to select if you want just the latest version of LID-only collections in your deep archive versus the default behavior of **all** versions of them.


PDS Delivery Checklist
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ install_requires =
lxml == 4.6.3
zope.component == 5.0.1
zope.interface == 5.4.0
pds.api-client == 0.5.0
pds.api-client == 0.8.0
# It's a bummer we can't use the "pds" namespace and have to use "pds2".
# See https://github.com/NASA-PDS/pds-api-client/issues/7 for why.
namespace_packages = pds2
Expand Down
190 changes: 135 additions & 55 deletions src/pds2/aipgen/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,33 @@
from .utils import addbundlearguments
from .utils import addloggingarguments

# Import entity classes: in this case we just need class ``Product``.
#
# 😛 Apparently this API changes with the phase of the moon. See, in some versions of pds.api-client,
# the name of the ``model`` package is ``model``, singular. But then seemingly at random, it becomes
# ``models`` plural. And even some releases support *both*. So here we try to accommodate whatever the
# flavor du jour is.
try:
from pds.api_client.model.product import Product # type: ignore
except ImportError:
from pds.api_client.models.product import Product # type: ignore

# If this fails to import, then we're using a pds.api-client ≤ 0.5.0, which I'm arbitrarily declaring "too old":
from pds.api_client.exceptions import ApiAttributeError # type: ignore

# Import functional endpoints.
#
# 😛 Apparently this API changes more frequently than a fringe politician's platform. See, in
# some versions of pds.api-client, the endpoint classes are importable directly from ``pds.api_client``.
# And in other releases, they're not. And it seems to swap randomly. So here we try to be resilient
# to whatever version of pds.api-client we get stuck with.
try:
from pds.api_client import CollectionsProductsApi, BundlesCollectionsApi, BundlesApi # type: ignore
except ImportError:
from pds.api_client.api.bundles_api import BundlesApi # type: ignore
from pds.api_client.api.bundles_collections_api import BundlesCollectionsApi # type: ignore
from pds.api_client.api.collections_products_api import CollectionsProductsApi # type: ignore


# Constants
# =========
Expand Down Expand Up @@ -122,7 +149,7 @@ def _makefilename(lidvid: str, ts: datetime, kind: str, ext: str) -> str:
return f"{lid}_v{vid}_{slate}_{kind}_v{AIP_SIP_DEFAULT_VERSION}{ext}"


def _getbundle(apiclient: pds.api_client.ApiClient, lidvid: str) -> pds.api_client.models.product.Product:
def _getbundle(apiclient: pds.api_client.ApiClient, lidvid: str) -> Product:
"""Get a bundle.
Using the PDS ``apiclient`` find the PDS bundle with the named ``lidvid`` and return it not as
Expand All @@ -131,7 +158,7 @@ def _getbundle(apiclient: pds.api_client.ApiClient, lidvid: str) -> pds.api_clie
"""
try:
_logger.debug("⚙️ Asking ``bundle_by_lidvid`` for %s", lidvid)
bundles = pds.api_client.BundlesApi(apiclient)
bundles = BundlesApi(apiclient)
return bundles.bundle_by_lidvid(lidvid) # type = ``Product_Bundle``
except pds.api_client.exceptions.ApiException as ex:
if ex.status == http.client.NOT_FOUND:
Expand All @@ -140,28 +167,48 @@ def _getbundle(apiclient: pds.api_client.ApiClient, lidvid: str) -> pds.api_clie
raise


def _getcollections(apiclient: pds.api_client.ApiClient, lidvid: str, workaroundpaginationbug=True):
def _getcollections(apiclient: pds.api_client.ApiClient, lidvid: str, allcollections=True):
"""Get the collections.
Using the PDS ``apiclient`` generate collections that belong to the PDS bundle ``lidvid``.
If ``workaroundpaginationbug`` is True, avoid the bug in the count of items returned from the
``/bundles/{lidvid}/collections`` endpoint; see NASA-PDS/pds-api#73.
If ``allcollections`` is True, then return all collections for LID-only references; otherwise
return just the latest collection for LID-only references (has no effect on full LIDVID-references).
"""
bcapi, start = pds.api_client.BundlesCollectionsApi(apiclient), 0
bcapi, start = BundlesCollectionsApi(apiclient), 0
while True:
limit = _apiquerylimit - 1 if workaroundpaginationbug else _apiquerylimit
_logger.debug("⚙️ Asking ``collections_of_a_bundle`` for %s at %d limit %d", lidvid, start, limit)
results = bcapi.collections_of_a_bundle(lidvid, start=start, limit=limit, fields=_fields)
if results.data is None:
return
_logger.debug('⚙️ Asking ``collections_of_a_bundle`` for %s at %d limit %d', lidvid, start, _apiquerylimit)

try:
if allcollections:
results = bcapi.collections_of_a_bundle_all(lidvid, start=start, limit=_apiquerylimit, fields=_fields)
else:
results = bcapi.collections_of_a_bundle_latest(
lidvid, start=start, limit=_apiquerylimit, fields=_fields
)
except AttributeError:
msg = '☡ Warning: the all+latest collections_of_a_bundle API is missing, reverting to older behavior'
_logger.warning(msg)
results = bcapi.collections_of_a_bundle(lidvid, start=start, limit=_apiquerylimit, fields=_fields)

# 😛 Apparently this API changes more often than a newborn's nappies. See, in some releases of
# pds.api-client, ``data`` is an attribute on ``results``. And in other versions, it's an indexed
# element of ``results``. What you get is pretty random! So here we try to be resilient to whatever
# the "soup of the day" is with pds.api-client.
try:
if results.data is None:
return
except ApiAttributeError:
if 'data' not in results:
return
start += len(results.data)
for i in results.data:
yield i


def _getproducts(apiclient: pds.api_client.ApiClient, lidvid: str):
"""Using the PDS ``apiclient`` generate PDS products that belong to the collection ``lidvid``."""
cpapi, start = pds.api_client.CollectionsProductsApi(apiclient), 0
cpapi, start = CollectionsProductsApi(apiclient), 0
while True:
try:
_logger.debug("⚙️ Asking ``products_of_a_collection`` for %s at %d limit %d", lidvid, start, _apiquerylimit)
Expand All @@ -171,29 +218,65 @@ def _getproducts(apiclient: pds.api_client.ApiClient, lidvid: str):
return
else:
raise
if results.data is None:
return

# 😛 Apparently this API changes faster than Superman in a phone booth. See, in some releases of
# pds.api-client, ``data`` is an indexed element of ``results``, and in others it's a named
# attribute of ``results``. What is it today? No one can tell, so here we try to be flexible
# to whatever shape it gives us.
try:
if results.data is None:
return
except ApiAttributeError:
if 'data' not in results:
return

start += len(results.data)
for i in results.data:
yield i


def _addfiles(product: pds.api_client.models.Product, bac: dict):
def _addfiles(product: Product, bac: dict):
"""Add the PDS files described in the PDS ``product`` to the ``bac``."""
lidvid, props = product.id, product.properties # Shorthand
# 😛 Apparently this API changes as frequently as my knickers. See, in some releases of pds.api-client,
# ``Product`` entity objects have two named attributes, ``id`` and ``properties``. But then sometimes,
# and for apparently random reasons, ``id`` and ``properties`` become indexed elements of a ``Product``.
# So, we try to accommodate whatever the flavor du jour is.
try:
lidvid, props = product['id'], product['properties']
except TypeError:
lidvid, props = product.id, product.properties

files = bac.get(lidvid, set()) # Get the current set (or a new empty set)

if _propdataurl in props: # Are there data files in the product?
urls, md5s = props[_propdataurl], props[_propdatamd5] # Get the URLs and MD5s of them
for url, md5 in zip(urls, md5s): # For each URL and matching MD5
files.add(_File(url, md5)) # Add it to the set
if _proplabelurl in props: # How about the label itself?
files.add(_File(props[_proplabelurl][0], props[_proplabelmd5][0])) # Add it too
# 😛 Apparently this API changes depending on the day of the week. See, in some releases of
# pds.api-client, the URLs and MD5s are directly two sequences of the properties. And in other
# releases, they're sequences of the ``value`` element of the properties. Why? Who knows! We
# jump through this extra try…except block here so we can work with whatever the pds.api-client
# decides to be that day.
try:
urls, md5s = props[_propdataurl], props[_propdatamd5] # Get the URLs and MD5s of them
for url, md5 in zip(urls, md5s): # For each URL and matching MD5
files.add(_File(url, md5)) # Add it to the set
except ApiAttributeError:
urls, md5s = props[_propdataurl]['value'], props[_propdatamd5]['value'] # Get the URLs and MD5s of them
for url, md5 in zip(urls, md5s):
files.add(_File(url, md5))

# 😛 Apparently this API changes faster than Coinstar™. For the same reason above, sometimes the
# URL and MD5 sequences are directly accessible from the properties, and sometimes they're in a
# ``value`` element of properties. Whew!
try:
if _proplabelurl in props: # How about the label itself?
files.add(_File(props[_proplabelurl][0], props[_proplabelmd5][0])) # Add it too
except ApiAttributeError:
if _proplabelurl in props: # How about the label itself?
files.add(_File(props[_proplabelurl]['value'][0], props[_proplabelmd5]['value'][0])) # Add it too

bac[lidvid] = files # Stash for future use


def _comprehendregistry(
url: str, bundlelidvid: str, allcollections=True, workaroundpaginationbug=True
) -> tuple[int, dict, str]:
def _comprehendregistry(url: str, bundlelidvid: str, allcollections=True) -> tuple[int, dict, str]:
"""Fathom the registry.
Query the PDS API at ``url`` for all information about the PDS ``bundlelidvid`` and return a
Expand All @@ -203,7 +286,10 @@ def _comprehendregistry(
within it, the "B.A.C." (a dict mapping PDS lidvids to sets of ``_File``s), and the title of
the PDS bundle.
Note: currently ``allcollections`` is ignored; see NASA-PDS/pds-api#74.
If ``allcollections`` is True, we include all collections, meaning that if a bundle references
a collection with LID only (no VID), we include all version IDs of that collection. When this
flag ``allcollections`` is False, then we include only the *latest* collection for a LID-only
reference.
"""
_logger.debug("🤔 Comprehending the registry at %s for %s", url, bundlelidvid)

Expand All @@ -219,15 +305,32 @@ def _comprehendregistry(
bundle = _getbundle(apiclient, bundlelidvid) # There's no class "Bundle" but class Product 🤷‍♀️
if bundle is None:
raise ValueError(f"🤷‍♀️ The bundle {bundlelidvid} cannot be found in the registry at {url}")
title = bundle.title if bundle.title else "«unknown»"

# 😛 Did I mention this API changes **a lot?**
#
# The pds.api-client is pretty fickle between releases: sometimes ``title`` is an indexed value
# of the ``bundle``, and sometimes it's a named attribute of the bundle. The try…except block here
# handles both cases.
try:
title = bundle.get('title', '«unknown»')
except AttributeError:
title = bundle.title if bundle.title else '«unknown»'

_addfiles(bundle, bac)

bundleurl = bundle.metadata.label_url
# 😛 I'm sure I mentioned it by now!
#
# Ditto the above comment, but for ``metadata``'s ``label_url``.
try:
bundleurl = bundle['metadata']['label_url']
except TypeError:
bundleurl = bundle.metadata.label_url

prefixlen = bundleurl.rfind("/") + 1

# It turns out the PDS registry makes this *trivial* compared to the PDS filesystem version;
# just understanding that it was all there was the hard part! 😊 THANK YOU! 🙏
for collection in _getcollections(apiclient, bundlelidvid, workaroundpaginationbug):
for collection in _getcollections(apiclient, bundlelidvid, allcollections):
_addfiles(collection, bac)
for product in _getproducts(apiclient, collection.id):
_addfiles(product, bac)
Expand Down Expand Up @@ -331,23 +434,21 @@ def _writesip(bundlelidvid: str, bac: dict, title: str, site: str, ts: datetime,
writesiplabel(lid, vid, title, hashish.hexdigest(), size, count, "MD5", sipfn, site, o, cmmd5, ts)


def generatedeeparchive(url: str, bundlelidvid: str, site: str, allcollections=True, workaroundpaginationbug=True):
def generatedeeparchive(url: str, bundlelidvid: str, site: str, allcollections=True):
"""Make a PDS "deep archive" 🧘 in the current directory.
A PDS "deep archive" 🧘‍♀️ (consisting of the Archive Information Package's transfer manifest and
checksum manifest, and the Submission Information Package's table file—plus their corresponding
labels) for the named PDS bundle identified by ``bundlelidvid``, for the PDS ``site``, using knowledge
in the PDS Registry at ``url``, including ``allcollections`` if True else just the latest collection
for PDS bundles that reference collections by logical identifier only, and neatly avoiding a PDS bug,
namely the ``workaroundpaginationbug`` if True if a certain PDS registry endpoint doesn't handle
PDS pagination right.
for PDS bundles that reference collections by logical identifier only.
"""
# When is this happening? Make a timestamp and remove the timezone info
ts = datetime.utcnow()
ts = datetime(ts.year, ts.month, ts.day, ts.hour, ts.minute, ts.second, microsecond=0, tzinfo=None)

# Figure out what we're dealing with
prefixlen, bac, title = _comprehendregistry(url, bundlelidvid, allcollections, workaroundpaginationbug)
prefixlen, bac, title = _comprehendregistry(url, bundlelidvid, allcollections)

# Make it rain ☔️
cmmd5 = _writeaip(bundlelidvid, prefixlen, bac, ts)
Expand All @@ -366,34 +467,13 @@ def main():
parser.add_argument(
"-s", "--site", required=True, choices=PROVIDER_SITE_IDS, help="Provider site ID for the manifest's label"
)
parser.add_argument(
"--disable-pagination-workaround",
action="store_true",
help="By default, this program will sidestep an issue in the PDS Registry that treats pagination "
'of results from the "collections of a bundle query" as being off by one item; specifiy this option '
"disables this workaround—see https://github.com/NASA-PDS/pds-api/issues/73 for more information",
)
parser.add_argument("bundle", help="LIDVID of the PDS bundle for which to create a PDS Deep Archive")
args = parser.parse_args()
logging.basicConfig(level=args.loglevel, format="%(levelname)s %(message)s")
_logger.info("👟 PDS Deep Registry-based Archive, version %s", __version__)
_logger.debug("💢 command line args = %r", args)
if args.include_latest_collection_only:
_logger.critical(
"🙇 SORRY! Including only the latest collection is not yet supported! Please see "
"https://github.com/NASA-PDS/pds-api/issues/74 and for now re-run without --include-latest-collection-only"
)
sys.exit(1)

_logger.debug("%r", args)
try:
generatedeeparchive(
args.url,
args.bundle,
args.site,
not args.include_latest_collection_only,
not args.disable_pagination_workaround,
)
generatedeeparchive(args.url, args.bundle, args.site, not args.include_latest_collection_only)
except pds.api_client.exceptions.ApiException as ex:
if ex.status == http.client.INTERNAL_SERVER_ERROR:
_logger.critical(
Expand Down

0 comments on commit db42ba2

Please sign in to comment.