Skip to content

Commit

Permalink
add kompress, support reading compressed json files
Browse files Browse the repository at this point in the history
  • Loading branch information
purarue committed Feb 10, 2024
1 parent 734bc46 commit beae3c9
Show file tree
Hide file tree
Showing 8 changed files with 69 additions and 33 deletions.
12 changes: 10 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -217,9 +217,17 @@ In addition to `.json` files, this can parse `.jsonl` ([JSON lines](http://jsonl
browserexport merge --stream --json ~/data/browsing/*.sqlite > ./history.jsonl
```
_Additionally_, this can parse gzipped versions of those files - files like `history.json.gz` or `history.jsonl.gz`
_Additionally_, this can parse compressed JSON/JSONL files (using [kompress](https://github.com/karlicoss/kompress/)): `.xz`, `.zip`, `.lz4`, `.zstd`, `.zst`, `.tar.gz`, `.gz`
If you don't care about keeping the raw databases for any other auxiliary info like form, bookmark data, or [from_visit](https://github.com/seanbreckenridge/browserexport/issues/30) info and just want the URL, visit date and metadata, you could use `merge` to periodically merge the bulky `.sqlite` files into a gzipped JSONL dump:
For example, you could do:
```bash
browserexport merge --stream --json ~/data/browsing/*.sqlite | gzip --best > ./history.jsonl.gz
# test parsing the compressed file
browserexport --debug inspect ./history.jsonl.gz
```
If you don't care about keeping the raw databases for any other auxiliary info like form data, bookmark data, or [from_visit](https://github.com/seanbreckenridge/browserexport/issues/30) info and just want the URL, visit date and metadata, you could use `merge` to periodically merge the bulky `.sqlite` files into a gzipped JSONL dump to reduce storage space, and improve parsing speed:
```bash
# backup databases
Expand Down
8 changes: 5 additions & 3 deletions browserexport/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,11 @@ def _wrapped_browser_list() -> str:


@cli.command(
epilog="For a list of all browsers, run 'LIST_BROWSERS=1 browserexport save --help'"
if not LIST_BROWSERS
else None,
epilog=(
"For a list of all browsers, run 'LIST_BROWSERS=1 browserexport save --help'"
if not LIST_BROWSERS
else None
),
)
@click.option(
"-b",
Expand Down
10 changes: 7 additions & 3 deletions browserexport/browsers/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from functools import lru_cache
from datetime import datetime, timezone
from typing import (
Generator,
List,
Iterator,
Optional,
Expand Down Expand Up @@ -100,9 +101,12 @@ def from_datetime_microseconds(ts: int) -> datetime:


def handle_glob(bases: Sequence[Path], stem: str, recursive: bool = False) -> Path:
dbs: List[Path]
method = Path.rglob if recursive else Path.glob
dbs = list(chain(*[method(base, stem) for base in bases]))
glob_itrs: List[Generator[Path, None, None]]
if recursive: # bleh, split like this to make mypy happy
glob_itrs = [base.rglob(stem) for base in bases]
else:
glob_itrs = [base.glob(stem) for base in bases]
dbs: List[Path] = list(chain(*glob_itrs))
recur_desc = "recursive" if recursive else "non recursive"
logger.debug(f"Glob {bases} with {stem} ({recur_desc}) matched {dbs}")
if len(dbs) > 1:
Expand Down
44 changes: 19 additions & 25 deletions browserexport/parse.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import os
import sqlite3
import tempfile
import shutil
from pathlib import Path
from typing import Iterator, List, Any, Dict, TextIO, Optional, Type, BinaryIO

from kompress import is_compressed, CPath

from .common import PathIshOrConn, PathIsh, expand_path, BrowserexportError
from .model import Visit
from .log import logger
Expand All @@ -29,7 +32,7 @@ def _read_json_obj(path: TextIO) -> Iterator[Dict[str, Any]]:


def _read_json_file(path: PathIsh) -> Iterator[Dict[str, Any]]:
    """Open *path* as text and yield each visit dictionary parsed from it.

    Opening without an explicit mode lets a kompress CPath (or plain Path)
    pick the appropriate text-mode handling for compressed files.
    """
    fp = expand_path(path).open()
    try:
        yield from _read_json_obj(fp)
    finally:
        fp.close()


Expand All @@ -47,39 +50,30 @@ def _read_json_lines(fp: TextIO) -> Iterator[Dict[str, Any]]:
yield json.loads(line)


KNOWN_FORMATS = [".json", ".jsonl", ".json.gz", ".jsonl.gz"]
JSON_FORMATS = [".json", ".jsonl"]


def _detect_extensions(path: PathIsh) -> str:
    """Return the primary (inner) file extension of *path*, ignoring any
    trailing compression suffix.

    E.g. 'history.jsonl.gz' -> '.jsonl', 'history.json' -> '.json',
    '/x/foo.sqlite' -> '.sqlite'.
    """
    basepath: str = os.path.basename(str(path))
    # if the file ends in a compression extension kompress recognizes
    # (.gz, .xz, .zstd, ...), strip that suffix first so splitext below
    # sees the JSON/JSONL extension underneath it
    if is_compressed(basepath):
        basepath, _, _ = basepath.rpartition(".")

    _, primary_ext = os.path.splitext(basepath)
    return primary_ext


def _parse_known_formats(path: PathIsh) -> Iterator[Visit]:
    """Parse a merged JSON/JSONL dump (optionally compressed) into Visits.

    Wrapping the path in kompress.CPath makes .open() transparently
    decompress .gz/.xz/.zstd/... files, so both branches below read
    plain text regardless of compression.

    Raises ValueError if the detected extension is not .json/.jsonl.
    """
    pth: Path = CPath(expand_path(path))  # type: ignore
    ext = _detect_extensions(path)
    if ext not in JSON_FORMATS:
        raise ValueError(f"Unknown filetype: {path} extension={ext}")
    if ext == ".json":
        logger.debug("Reading as JSON")
        yield from map(Visit.from_dict, _read_json_file(pth))
    else:
        # only two formats in JSON_FORMATS; guard against future additions
        assert ext == ".jsonl", f"Unknown extension {ext}"
        logger.debug("Reading as JSON lines")
        with pth.open("r") as fp:
            yield from map(Visit.from_dict, _read_json_lines(fp))


def _read_buf_as_sqlite_db(buf: BinaryIO) -> sqlite3.Connection:
Expand Down Expand Up @@ -121,7 +115,7 @@ def read_visits(
browsers += DEFAULT_BROWSERS
logger.info(f"Reading visits from {path}...")

if isinstance(path, (str, Path)) and _is_known_format(path):
if isinstance(path, (str, Path)) and _detect_extensions(path) in JSON_FORMATS:
logger.debug("Detected merged JSON file, mapping to Visit directly")
try:
yield from _parse_known_formats(path)
Expand Down
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,14 @@ classifiers =
Programming Language :: Python :: 3.9
Programming Language :: Python :: 3.10
Programming Language :: Python :: 3.11
Programming Language :: Python :: 3.12
keywords = backup, brave, chrome, chromium, database, export, firefox, google-chrome, history, palemoon, safari, vivaldi, waterfox

[options]
packages = find_namespace:
install_requires =
click>=8.1
kompress>=0.1.20231016
logzero
sqlite-backup>=0.1.7
python_requires = >=3.8
Expand Down
Binary file added tests/databases/merged_dump.json.xz
Binary file not shown.
14 changes: 14 additions & 0 deletions tests/test_browserexport.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,13 @@ def test_read_jsonl_gz(jsonl_gz_dump: Path) -> None:
assert json_vis[1].url == "https://github.com/junegunn/fzf#installation"


def test_reading_xz_file(xz_file: Path) -> None:
    """xz-compressed JSON dumps should parse transparently via read_visits."""
    # sanity-check the fixture actually points at the compressed dump
    assert xz_file.name.endswith(".json.xz")
    visits = list(read_visits(xz_file))
    assert len(visits) == 1
    first = visits[0]
    assert first.url == "https://github.com/junegunn/fzf"


def test_mixed_read(json_dump: Path, firefox: Path) -> None:
jvis = list(read_visits(json_dump))
fvisits = list(read_visits(firefox))
Expand Down Expand Up @@ -245,6 +252,13 @@ def json_gz_dump() -> Iterator[Path]:
yield p


@pytest.fixture()
def xz_file() -> Iterator[Path]:
    """Yield the path to the checked-in xz-compressed JSON test dump."""
    xz_path = databases_dir / "merged_dump.json.xz"
    assert xz_path.exists()
    yield xz_path


@pytest.fixture()
def jsonl_gz_dump() -> Iterator[Path]:
p = databases_dir / "merged_dump.jsonl.gz"
Expand Down
12 changes: 12 additions & 0 deletions tests/test_package.py
Original file line number Diff line number Diff line change
@@ -1 +1,13 @@
from browserexport.model import test_make_metadata
from browserexport.parse import _detect_extensions


def test_detect_extensions() -> None:
    """_detect_extensions strips compression suffixes and directory parts."""
    cases = [
        ("foo.json", ".json"),
        ("foo.json.gz", ".json"),
        ("foo.jsonl", ".jsonl"),
        ("foo.jsonl.gz", ".jsonl"),
        ("/something/else/foo.sqlite", ".sqlite"),
        ("/something/else/foo.jsonl.gz", ".jsonl"),
        ("/something/else/foo.jsonl.zstd", ".jsonl"),
        ("/something/else/foo.jsonl.xz", ".jsonl"),
    ]
    for given, expected in cases:
        assert _detect_extensions(given) == expected

0 comments on commit beae3c9

Please sign in to comment.