Skip to content

Commit

Permalink
archive: use BytesIO to store data file
Browse files Browse the repository at this point in the history
When opening a sigmf archive with SigMFArchiveReader(), the data file is
currently set to the full archive (including metadata).

This causes issues when writing the archive back to disk and invalidates
the metadata hash since the data_file is now a tar archive and not just
the set of samples.

To work around the issue, carry around a BytesIO buffer with the content
of the data_file and write it to the tmpdir just before saving a new
archive to disk.

Signed-off-by: Liam Beguin <[email protected]>
  • Loading branch information
liambeguin committed Mar 31, 2024
1 parent bc73751 commit 747e162
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 11 deletions.
10 changes: 8 additions & 2 deletions sigmf/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"""Create and extract SigMF archives."""

import os
import io
import shutil
import tarfile
import tempfile
Expand Down Expand Up @@ -73,7 +74,12 @@ def __init__(self, sigmffile, name=None, fileobj=None):
with open(sigmf_md_path, "w") as mdfile:
self.sigmffile.dump(mdfile, pretty=True)

shutil.copy(self.sigmffile.data_file, sigmf_data_path)
if isinstance(self.sigmffile.data_buffer, io.BytesIO):
self.sigmffile.data_file = sigmf_data_path
with open(sigmf_data_path, 'wb') as f:
f.write(self.sigmffile.data_buffer.getbuffer())
else:
shutil.copy(self.sigmffile.data_file, sigmf_data_path)

def chmod(tarinfo):
if tarinfo.isdir():
Expand Down Expand Up @@ -111,7 +117,7 @@ def _ensure_name_has_correct_extension(self):
self.name = name if has_correct_extension else name + SIGMF_ARCHIVE_EXT

def _ensure_data_file_set(self):
if not self.sigmffile.data_file:
if not self.sigmffile.data_file and not isinstance(self.sigmffile.data_buffer, io.BytesIO):
err = "no data file - use `set_data_file`"
raise SigMFFileError(err)

Expand Down
8 changes: 5 additions & 3 deletions sigmf/archivereader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
"""Access SigMF archives without extracting them."""

import os
import io
import shutil
import tarfile
import tempfile
from pathlib import Path

from . import __version__ #, schema, sigmf_hash, validate
from .sigmffile import SigMFFile
Expand Down Expand Up @@ -64,6 +66,8 @@ def __init__(self, name=None, skip_checksum=False, map_readonly=True, archive_bu
elif memb.name.endswith(SIGMF_DATASET_EXT):
data_offset = memb.offset_data
data_size_bytes = memb.size
with tar_obj.extractfile(memb) as memb_fid:
data_buffer = io.BytesIO(memb_fid.read())

else:
print('A regular file', memb.name, 'was found but ignored in the archive')
Expand All @@ -77,10 +81,8 @@ def __init__(self, name=None, skip_checksum=False, map_readonly=True, archive_bu
valid_md = self.sigmffile.validate()

self.sigmffile.set_data_file(
self.name,
data_buffer=archive_buffer,
data_buffer=data_buffer,
skip_checksum=skip_checksum,
offset=data_offset,
size_bytes=data_size_bytes,
map_readonly=map_readonly,
)
Expand Down
19 changes: 13 additions & 6 deletions sigmf/sigmffile.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from os import path
import warnings
import numpy as np
import io

from . import __version__, schema, sigmf_hash, validate
from .archive import SigMFArchive, SIGMF_DATASET_EXT, SIGMF_METADATA_EXT, SIGMF_ARCHIVE_EXT, SIGMF_COLLECTION_EXT
Expand Down Expand Up @@ -245,7 +246,7 @@ def _is_conforming_dataset(self):
# check for any non-zero `header_bytes` fields in captures segments
if capture.get(self.HEADER_BYTES_KEY, 0):
return False
if not path.isfile(self.data_file):
if self.data_file is not None and not path.isfile(self.data_file):
return False
# if we get here, the file exists and is conforming
return True
Expand Down Expand Up @@ -630,7 +631,7 @@ def read_samples(self, start_index=0, count=-1, autoscale=True, raw_components=F
raise IOError('Number of samples must be greater than zero, or -1 for all samples.')
elif start_index + count > self.sample_count:
raise IOError("Cannot read beyond EOF.")
if self.data_file is None:
if self.data_file is None and not isinstance(self.data_buffer, io.BytesIO):
if self.get_global_field(self.METADATA_ONLY_KEY, False):
                # only if data_file is `None` allows access to dynamically generated datasets
raise SigMFFileError("Cannot read samples from a metadata only distribution.")
Expand All @@ -657,9 +658,13 @@ def _read_datafile(self, first_byte, nitems, autoscale, raw_components):
data_type_out = np.dtype("f4") if not self.is_complex_data else np.dtype("f4, f4")
num_channels = self.get_num_channels()

fp = open(self.data_file, "rb")
fp.seek(first_byte, 0)
data = np.fromfile(fp, dtype=data_type_in, count=nitems)
if self.data_file is not None:
fp = open(self.data_file, "rb")
fp.seek(first_byte, 0)
data = np.fromfile(fp, dtype=data_type_in, count=nitems)
else:
data = self._memmap

if num_channels != 1:
# return reshaped view for num_channels
# first dimension will be double size if `is_complex_data`
Expand All @@ -677,7 +682,9 @@ def _read_datafile(self, first_byte, nitems, autoscale, raw_components):
else:
data = data.view(component_type_in)

fp.close()
if self.data_file is not None:
fp.close()

return data


Expand Down

0 comments on commit 747e162

Please sign in to comment.