From d7c250bdac3f2797846495296ead5ef7c794c74e Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Fri, 22 Sep 2023 20:51:58 -0700 Subject: [PATCH 01/27] spelling fixes --- google_takeout_parser/models.py | 2 +- google_takeout_parser/parse_html/activity.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/google_takeout_parser/models.py b/google_takeout_parser/models.py index 3072a7b..e332274 100644 --- a/google_takeout_parser/models.py +++ b/google_takeout_parser/models.py @@ -194,7 +194,7 @@ def key(self) -> Tuple[str, int]: return self.url, int(self.dt.timestamp()) -# cant compute this dynamically -- have to write it out +# can't compute this dynamically -- have to write it out # if you want to override, override both global variable types with new types DEFAULT_MODEL_TYPE = Union[ Activity, diff --git a/google_takeout_parser/parse_html/activity.py b/google_takeout_parser/parse_html/activity.py index 7d1dd7a..b2e2116 100644 --- a/google_takeout_parser/parse_html/activity.py +++ b/google_takeout_parser/parse_html/activity.py @@ -223,7 +223,7 @@ def _parse_caption( elif len(links) == 1: if _is_location_api_link(links[0]): url = links[0] - # wasnt set in partition above, was only one + # wasn't set in partition above, was only one # phrase of text if name is None: name = textbuf @@ -233,7 +233,7 @@ def _parse_caption( source = textbuf else: # no links, just a description of the source - # (since theres no URL, cant be name) + # (since there's no URL, can't be name) source = textbuf locationInfos.append( From 545d3055b2a1790e422ede1839ec0775abfc9e42 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Fri, 22 Sep 2023 21:11:27 -0700 Subject: [PATCH 02/27] fix some mypy errors --- google_takeout_parser/__main__.py | 4 ++-- google_takeout_parser/merge.py | 15 +++++++++++++-- tests/test_types.py | 2 +- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/google_takeout_parser/__main__.py b/google_takeout_parser/__main__.py index 1092740..395d545 100644 --- a/google_takeout_parser/__main__.py +++ b/google_takeout_parser/__main__.py @@ -106,9 +106,9 @@ def merge(cache: bool, action: str, takeout_dir: Sequence[str]) -> None: """ from .path_dispatch import TakeoutParser from .merge import cached_merge_takeouts, merge_events - from .models import DEFAULT_MODEL_TYPE + from .models import DEFAULT_MODEL_TYPE, Res - res: List[DEFAULT_MODEL_TYPE] = [] + res: List[Res[DEFAULT_MODEL_TYPE]] = [] if cache: res = list(cached_merge_takeouts(list(takeout_dir))) else: diff --git a/google_takeout_parser/merge.py b/google_takeout_parser/merge.py index 69c0b04..3a2b5bf 100644 --- a/google_takeout_parser/merge.py +++ b/google_takeout_parser/merge.py @@ -21,10 +21,21 @@ # else Im just duplicating code that would exist in HPI anyways +def _cache_path(_takeout_paths: List[PathIsh]) -> str: + """ + Cache path for the merged takeout + """ + return str(takeout_cache_path / "_merged_takeouts") + + +def _depends_on(pths: List[PathIsh]) -> str: + return str(list(sorted([str(p) for p in pths]))) + + # Note: only used for this module, HPI caches elsewhere @cachew( - cache_path=lambda _: str(takeout_cache_path / "_merged_takeouts"), - depends_on=lambda pths: list(sorted([str(p) for p in pths])), + cache_path=_cache_path, + depends_on=_depends_on, force_file=True, logger=logger, ) diff --git a/tests/test_types.py b/tests/test_types.py index 91b9187..9921ade 100644 --- a/tests/test_types.py +++ b/tests/test_types.py @@ -8,7 +8,7 @@ def test_check_union() -> None: """ Makes sure that any classes 
defined in models are included in the union type - sanity check test to ensure cachew doesnt fail with difficult to debug union/errors + sanity check test to ensure cachew doesn't fail with difficult to debug union/errors """ classes = { From 8aae314aaca74228d5c8a54d35257defffe5f176 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Fri, 29 Sep 2023 11:53:40 -0700 Subject: [PATCH 03/27] run CI on all branches --- .github/workflows/ci.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 63cc35e..b5faac0 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -2,9 +2,9 @@ name: CI on: push: - branches: [master] + branches: ['*'] pull_request: - branches: [master] + branches: ['*'] jobs: build: From 3ff25601f0555bef9036846bc52878a0ebeefd59 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Sat, 30 Sep 2023 12:54:28 -0700 Subject: [PATCH 04/27] use static path for merged takeouts --- google_takeout_parser/merge.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/google_takeout_parser/merge.py b/google_takeout_parser/merge.py index 3a2b5bf..a957ce0 100644 --- a/google_takeout_parser/merge.py +++ b/google_takeout_parser/merge.py @@ -21,21 +21,10 @@ # else Im just duplicating code that would exist in HPI anyways -def _cache_path(_takeout_paths: List[PathIsh]) -> str: - """ - Cache path for the merged takeout - """ - return str(takeout_cache_path / "_merged_takeouts") - - -def _depends_on(pths: List[PathIsh]) -> str: - return str(list(sorted([str(p) for p in pths]))) - - # Note: only used for this module, HPI caches elsewhere @cachew( - cache_path=_cache_path, - depends_on=_depends_on, + cache_path=str(takeout_cache_path / "_merged_takeouts"), + depends_on=lambda tp: str([str(p) for p in tp]), force_file=True, logger=logger, ) From 86a7a9f109409cb3b46ba85d06c2aa37689dd050 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Sat, 30 Sep 2023 12:56:29 -0700 Subject: [PATCH 05/27] sort like before to maintain depends_on --- google_takeout_parser/merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/google_takeout_parser/merge.py b/google_takeout_parser/merge.py index a957ce0..d2f05f6 100644 --- a/google_takeout_parser/merge.py +++ b/google_takeout_parser/merge.py @@ -24,7 +24,7 @@ # Note: only used for this module, HPI caches elsewhere @cachew( cache_path=str(takeout_cache_path / "_merged_takeouts"), - depends_on=lambda tp: str([str(p) for p in tp]), + depends_on=lambda tp: str(list(sorted(str(p) for p in tp))), force_file=True, logger=logger, ) From 9aab653e97a5a6602c402c6a7f14e7f9621ef1fd Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Sat, 30 Sep 2023 13:02:26 -0700 Subject: [PATCH 06/27] fix path for new cachew, use setup.cfg --- setup.cfg | 67 +++++++++++++++++++++++++++++++++++++++++---- setup.py | 54 ++---------------------------------- tests/test_types.py | 2 +- 3 files changed, 65 insertions(+), 58 deletions(-) diff --git a/setup.cfg b/setup.cfg index 090b7e6..83f9a38 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,10 +1,66 @@ +[metadata] +name = google_takeout_parser +version = 0.1.3 +description = Parses data out of your Google Takeout (History, Activity, Youtube, Locations, etc...) 
+long_description = file: README.md +long_description_content_type = text/markdown +url = https://github.com/seanbreckenridge/google_takeout_parser +author = Sean Breckenridge +author_email = "seanbrecke@gmail.com" +license = MIT +license_files = LICENSE +classifiers = + License :: OSI Approved :: MIT License + Programming Language :: Python + Programming Language :: Python :: 3 + Programming Language :: Python :: 3 :: Only + Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 + Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.11 +keywords = google data parsing + +[options] +packages = find: +install_requires = + IPython + beautifulsoup4>=4.9.0 + cachew>=0.14.20230922 + click>=8.1 + logzero>=1.7.0 + lxml>=4.6.0 + platformdirs>=2.3.0 + pytz>=2021.3 +python_requires = >=3.8 +include_package_data = True + +[options.packages.find] +exclude = + tests* +include = + google_takeout_parser + google_takeout_parser.parse_html + +[options.entry_points] +console_scripts = + google_takeout_parser = google_takeout_parser.__main__:main + +[options.extras_require] +testing = + flake8 + mypy + pytest + +[options.package_data] +google_takeout_parser = py.typed + [flake8] -ignore=E501,E402,W503,E266,E203 +ignore = E501,E402,W503,E266,E203 [mypy] pretty = True show_error_context = True -show_error_codes = True +show_error_codes = True check_untyped_defs = True namespace_packages = True disallow_any_generics = True @@ -19,7 +75,6 @@ warn_unreachable = True [tool:pytest] addopts = - --doctest-modules google_takeout_parser - -vv - ./tests/ - + --doctest-modules google_takeout_parser + -vv + ./tests/ diff --git a/setup.py b/setup.py index d459510..7f1a176 100644 --- a/setup.py +++ b/setup.py @@ -1,52 +1,4 @@ -from pathlib import Path -from setuptools import setup, find_packages +from setuptools import setup -long_description = Path("README.md").read_text() -reqs = Path("requirements.txt").read_text().strip().splitlines() - -pkg = "google_takeout_parser" -setup( - name=pkg, - version="0.1.3", - url="https://github.com/seanbreckenridge/google_takeout_parser", - author="Sean Breckenridge", - author_email="seanbrecke@gmail.com", - description=( - """Parses data out of your Google Takeout (History, Activity, Youtube, Locations, etc...)""" - ), - long_description=long_description, - long_description_content_type="text/markdown", - license="MIT", - packages=find_packages( - include=["google_takeout_parser", "google_takeout_parser.parse_html"] - ), - install_requires=reqs, - package_data={pkg: ["py.typed"]}, - zip_safe=False, - keywords="google data parsing", - python_requires=">=3.7", - entry_points={ - "console_scripts": [ - "google_takeout_parser = google_takeout_parser.__main__:main" - ] - }, - extras_require={ - "testing": [ - "pytest", - "mypy", - "flake8", - ], - ':python_version<"3.7"': [ - "typing_extensions", - ], - }, - classifiers=[ - "License :: OSI Approved :: MIT License", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - ], -) +if __name__ == "__main__": + setup() diff --git a/tests/test_types.py b/tests/test_types.py index 9921ade..6ba0acd 100644 --- a/tests/test_types.py +++ b/tests/test_types.py @@ -1,7 +1,7 @@ import inspect import google_takeout_parser.models as mod -from cachew import get_union_args +from cachew.legacy import get_union_args def 
test_check_union() -> None: From 6a6abfaa0b31738b62052fb950de764afe524aa7 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Sat, 30 Sep 2023 13:03:59 -0700 Subject: [PATCH 07/27] update ci versions --- .github/workflows/ci.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index b5faac0..0a395b6 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -11,10 +11,10 @@ jobs: strategy: matrix: platform: [ubuntu-latest, windows-latest] - python-version: [3.7, 3.8, 3.9, "3.10"] + python-version: [3.8, 3.9, "3.10", "3.11"] exclude: [ - {platform: windows-latest, python-version: "3.8"}, - {platform: windows-latest, python-version: "3.9"} + {platform: windows-latest, python-version: "3.9"}, + {platform: windows-latest, python-version: "3.10"} ] runs-on: ${{ matrix.platform }} From c62965c37f87e3fb5192bb9e0dc25acf7b69f0fb Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Sat, 30 Sep 2023 13:07:06 -0700 Subject: [PATCH 08/27] v0.1.4: drop python3.7 support,bump cachew version --- README.md | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5b15508..35024c1 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,7 @@ This was extracted out of [my HPI](https://github.com/seanbreckenridge/HPI/tree/ ## Installation -Requires `python3.7+` +Requires `python3.8+` To install with pip, run: diff --git a/setup.cfg b/setup.cfg index 83f9a38..b2bd51d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = google_takeout_parser -version = 0.1.3 +version = 0.1.4 description = Parses data out of your Google Takeout (History, Activity, Youtube, Locations, etc...) long_description = file: README.md long_description_content_type = text/markdown From 17b2a23b71c01861938f344484b6b8d63091e225 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Sat, 30 Sep 2023 14:27:36 -0700 Subject: [PATCH 09/27] dynamically compute return_type --- README.md | 1 - google_takeout_parser/__init__.py | 14 ++- google_takeout_parser/compat.py | 8 -- google_takeout_parser/models.py | 14 +-- google_takeout_parser/parse_html/activity.py | 3 - google_takeout_parser/parse_html/comment.py | 3 - google_takeout_parser/parse_json.py | 19 +---- google_takeout_parser/path_dispatch.py | 89 ++++++++++++++------ tests/test_json.py | 4 +- tests/test_types.py | 31 +++++++ 10 files changed, 107 insertions(+), 79 deletions(-) delete mode 100644 google_takeout_parser/compat.py diff --git a/README.md b/README.md index 35024c1..b7fb21a 100644 --- a/README.md +++ b/README.md @@ -174,7 +174,6 @@ Just to give a brief overview, to add new functionality (parsing some new folder - Add a `model` for it in [`models.py`](google_takeout_parser/models.py) subclassing `BaseEvent` and adding it to the Union at the bottom of the file. That should have a `key` property function which describes each event uniquely (used to merge takeout events) - Write a function which takes the `Path` to the file you're trying to parse and converts it to the model you created (See examples in [`parse_json.py`](google_takeout_parser/parse_json.py)). Ideally extract a single raw item from the takeout file add a test for it so its obvious when/if the format changes. 
-- Set [the `return_type`](https://github.com/seanbreckenridge/google_takeout_parser/blob/7b1ee8ec3c3f36e6f279f20a9a214b6a3e8775f5/google_takeout_parser/parse_json.py#L71) property on the function, to use for caching/filtering - Add a regex match for the file path to the [`DEFAULT_HANDLER_MAP`](https://github.com/seanbreckenridge/google_takeout_parser/blob/2bd64b7373e4a2ac2ace32e03b25ca3b7e901034/google_takeout_parser/path_dispatch.py#L48) ### Testing diff --git a/google_takeout_parser/__init__.py b/google_takeout_parser/__init__.py index 5c05cf6..86adc69 100644 --- a/google_takeout_parser/__init__.py +++ b/google_takeout_parser/__init__.py @@ -1,10 +1,6 @@ -from pkg_resources import get_distribution, DistributionNotFound +import importlib.metadata -try: - # Change here if project is renamed and does not equal the package name - dist_name = __name__ - __version__ = get_distribution(dist_name).version -except DistributionNotFound: - __version__ = "unknown" -finally: - del get_distribution, DistributionNotFound +# Change here if project is renamed and does not equal the package name +__version__ = importlib.metadata.version(__name__) + +del importlib diff --git a/google_takeout_parser/compat.py b/google_takeout_parser/compat.py deleted file mode 100644 index 2c7eb03..0000000 --- a/google_takeout_parser/compat.py +++ /dev/null @@ -1,8 +0,0 @@ -import sys - -# from https://github.com/karlicoss/HPI/blob/master/my/core/compat.py - -if sys.version_info[:2] >= (3, 8): - from typing import Literal -else: - from typing_extensions import Literal # noqa: F401 diff --git a/google_takeout_parser/models.py b/google_takeout_parser/models.py index e332274..58ddd6d 100644 --- a/google_takeout_parser/models.py +++ b/google_takeout_parser/models.py @@ -7,7 +7,7 @@ from __future__ import annotations from datetime import datetime -from typing import Optional, List, Tuple, Any, Union, Iterator, TYPE_CHECKING, Dict +from typing import Optional, List, Tuple, Any, Union, Iterator, Dict, Protocol from dataclasses import dataclass from .common import Res @@ -26,14 +26,6 @@ # name, url Subtitles = Tuple[str, MetaData] -if TYPE_CHECKING: - try: - from typing import Protocol - except ImportError: - from typing_extensions import Protocol # type: ignore -else: - Protocol = object - class BaseEvent(Protocol): @property @@ -107,11 +99,11 @@ def key(self) -> int: class Location(BaseEvent): lat: float lng: float - accuracy: Optional[int] + accuracy: Optional[float] dt: datetime @property - def key(self) -> Tuple[float, float, Optional[int], int]: + def key(self) -> Tuple[float, float, Optional[float], int]: return self.lat, self.lng, self.accuracy, int(self.dt.timestamp()) diff --git a/google_takeout_parser/parse_html/activity.py b/google_takeout_parser/parse_html/activity.py index b2e2116..274cbca 100644 --- a/google_takeout_parser/parse_html/activity.py +++ b/google_takeout_parser/parse_html/activity.py @@ -337,6 +337,3 @@ def _parse_html_activity(p: Path) -> Iterator[Res[Activity]]: yield _parse_activity_div(outer_div, file_dt=file_dt) except Exception as ae: yield ae - - -_parse_html_activity.return_type = Activity # type: ignore[attr-defined] diff --git a/google_takeout_parser/parse_html/comment.py b/google_takeout_parser/parse_html/comment.py index 0a591e6..a3e2a28 100644 --- a/google_takeout_parser/parse_html/comment.py +++ b/google_takeout_parser/parse_html/comment.py @@ -60,9 +60,6 @@ def _parse_html_comment_file(p: Path) -> Iterator[Res[YoutubeComment]]: yield e -_parse_html_comment_file.return_type = YoutubeComment 
# type: ignore[attr-defined] - - def test_parse_html_comment_file() -> None: li_obj = bs4.BeautifulSoup( """
  • Sent at 2020-04-27 23:18:23 UTC while watching a video.
    content here
""", diff --git a/google_takeout_parser/parse_json.py b/google_takeout_parser/parse_json.py index 6f30a03..af011a5 100644 --- a/google_takeout_parser/parse_json.py +++ b/google_takeout_parser/parse_json.py @@ -70,9 +70,6 @@ def _parse_json_activity(p: Path) -> Iterator[Res[Activity]]: yield e -_parse_json_activity.return_type = Activity # type: ignore[attr-defined] - - def _parse_likes(p: Path) -> Iterator[Res[LikedYoutubeVideo]]: json_data = json.loads(p.read_text()) if not isinstance(json_data, list): @@ -91,9 +88,6 @@ def _parse_likes(p: Path) -> Iterator[Res[LikedYoutubeVideo]]: yield e -_parse_likes.return_type = LikedYoutubeVideo # type: ignore[attr-defined] - - def _parse_app_installs(p: Path) -> Iterator[Res[PlayStoreAppInstall]]: json_data = json.loads(p.read_text()) if not isinstance(json_data, list): @@ -109,9 +103,6 @@ def _parse_app_installs(p: Path) -> Iterator[Res[PlayStoreAppInstall]]: yield e -_parse_app_installs.return_type = PlayStoreAppInstall # type: ignore[attr-defined] - - def _parse_timestamp_key(d: Dict[str, Any], key: str) -> datetime: if f"{key}Ms" in d: return parse_datetime_millis(d[f"{key}Ms"]) @@ -137,14 +128,12 @@ def _parse_location_history(p: Path) -> Iterator[Res[Location]]: lng=float(loc["longitudeE7"]) / 1e7, lat=float(loc["latitudeE7"]) / 1e7, dt=_parse_location_timestamp(loc), - accuracy=None if accuracy is None else int(accuracy), + accuracy=None if accuracy is None else float(accuracy), ) except Exception as e: yield e -_parse_location_history.return_type = Location # type: ignore[attr-defined] - _sem_required_keys = ["location", "duration"] @@ -209,9 +198,6 @@ def _parse_semantic_location_history(p: Path) -> Iterator[Res[PlaceVisit]]: yield e -_parse_semantic_location_history.return_type = PlaceVisit # type: ignore[attr-defined] - - def _parse_chrome_history(p: Path) -> Iterator[Res[ChromeHistory]]: json_data = json.loads(p.read_text()) if "Browser History" not in json_data: @@ -226,6 +212,3 @@ def _parse_chrome_history(p: Path) -> Iterator[Res[ChromeHistory]]: ) except Exception as e: yield e - - -_parse_chrome_history.return_type = ChromeHistory # type: ignore[attr-defined] diff --git a/google_takeout_parser/path_dispatch.py b/google_takeout_parser/path_dispatch.py index a60526b..4da5de5 100644 --- a/google_takeout_parser/path_dispatch.py +++ b/google_takeout_parser/path_dispatch.py @@ -9,13 +9,14 @@ from typing import ( Iterator, Dict, + Union, Callable, Any, Optional, List, Type, Tuple, - cast, + Literal, ) from collections import defaultdict @@ -23,7 +24,6 @@ from cachew import cachew from . import __version__ as _google_takeout_version -from .compat import Literal from .common import Res, PathIsh from .cache import takeout_cache_path from .log import logger @@ -47,22 +47,68 @@ HandlerFunction = Callable[[Path], BaseResults] HandlerMap = Dict[str, Optional[HandlerFunction]] -_CacheKeySingle = Type[BaseEvent] -CacheKey = _CacheKeySingle +CacheKey = Tuple[Type[BaseEvent], ...] def _cache_key_to_str(c: CacheKey) -> str: - return str(c.__name__).casefold() + """Convert a cache key to a string""" + return "_".join(sorted(p.__name__ for p in c)).casefold() -def _parse_handler_return_type(handler: HandlerFunction) -> CacheKey: - assert hasattr( - handler, "return_type" - ), f"Handler functions should have an 'return_type' property which specifies what types this produces. See parse_json.py for an example. 
No 'return_type' on {handler}" - val: Any = getattr(handler, "return_type") - assert isinstance(val, type), f"{val} is not a type" - assert BaseEvent in val.__mro__, f"{val} not a subclass of BaseEvent" - return cast(_CacheKeySingle, val) +def _handler_type_cache_key(handler: HandlerFunction) -> CacheKey: + # Take a function like Iterator[Union[Item, Exception]] and return Item + + import inspect + from cachew.legacy import get_union_args + + sig = inspect.signature(handler) + + # get the return type of the function + # e.g. Iterator[Union[Item, Exception]] + return_type = sig.return_annotation + + # this must have a return type + if return_type == inspect.Signature.empty: + raise TypeError(f"Could not get return type for {handler.__name__}") + + # remove top-level iterator if it has it + if return_type._name == "Iterator": + return_type = return_type.__args__[0] + + args: Optional[Tuple[Type]] = get_union_args(return_type) # type: ignore[type-arg] + if args is None: + raise TypeError( + f"Could not get union args for {return_type} in {handler.__name__}" + ) + + # remove exceptions + t_args = tuple(t for t in args if t != Exception) + + for t in t_args: + if BaseEvent not in t.__mro__: + raise TypeError( + f"Return type {t} from {return_type} of {handler.__name__} does not contain BaseEvent" + ) + if t == BaseEvent: + raise TypeError( + f"Return type {t} from {return_type} of {handler.__name__} is BaseEvent, which is not allowed" + ) + + return tuple(t_args) + + +def _cache_key_to_type(c: CacheKey) -> Any: + """ + If theres one item in the cache key, return that + If theres multiple, return a Union of them + """ + assert len(c) > 0 + if len(c) == 1: + return c[0] + else: + assert isinstance(c, tuple) + + return Union[c] # type: ignore[valid-type] # If parsed, should mention: @@ -285,7 +331,7 @@ def _log_handler(self, path: Path, handler: Any) -> None: def _parse_raw(self, filter_type: Optional[Type[BaseEvent]] = None) -> BaseResults: """Parse the takeout with no cache. If a filter is specified, only parses those files""" handlers = self._group_by_return_type(filter_type=filter_type) - for cache_key, result_tuples in handlers.items(): + for _, result_tuples in handlers.items(): for path, itr in result_tuples: self._log_handler(path, itr) yield from itr @@ -339,9 +385,9 @@ def _group_by_return_type( """ handlers: Dict[CacheKey, List[Tuple[Path, BaseResults]]] = defaultdict(list) for path, handler in self.dispatch_map().items(): - ckey: CacheKey = _parse_handler_return_type(handler) + ckey: CacheKey = _handler_type_cache_key(handler) # don't include in the result if we're filtering to a specific type - if filter_type is not None and ckey != filter_type: + if filter_type is not None and filter_type not in ckey: logger.debug( f"Provided '{filter_type}' as filter, '{ckey}' doesn't match, ignoring '{path}'..." 
) @@ -381,14 +427,9 @@ def _cached_parse( ) -> BaseResults: handlers = self._group_by_return_type(filter_type=filter_type) for cache_key, result_tuples in handlers.items(): - # Hmm -- I think this should work with CacheKeys that have multiple - # types but it may fail -- need to check if one is added - # - # create a function which groups the iterators for this return type - # that all gets stored in one database - # - # the return type here is purely for cachew, so it can infer the type - def _func() -> Iterator[Res[cache_key]]: # type: ignore[valid-type] + _ret_type: Any = _cache_key_to_type(cache_key) + + def _func() -> Iterator[Res[_ret_type]]: # type: ignore[valid-type] for path, itr in result_tuples: self._log_handler(path, itr) yield from itr diff --git a/tests/test_json.py b/tests/test_json.py index 80bf494..5d1bbd1 100644 --- a/tests/test_json.py +++ b/tests/test_json.py @@ -110,7 +110,7 @@ def test_location_old(tmp_path_f) -> None: dt=datetime.datetime( 2017, 12, 10, 23, 14, 58, tzinfo=datetime.timezone.utc ), - accuracy=10, + accuracy=10.0, ), ] @@ -127,7 +127,7 @@ def test_location_new(tmp_path_f: Path) -> None: dt=datetime.datetime( 2017, 12, 10, 23, 14, 58, 30000, tzinfo=datetime.timezone.utc ), - accuracy=10, + accuracy=10.0, ), ] diff --git a/tests/test_types.py b/tests/test_types.py index 6ba0acd..696ef73 100644 --- a/tests/test_types.py +++ b/tests/test_types.py @@ -21,3 +21,34 @@ def test_check_union() -> None: union_args = set(ua) assert union_args == classes + + +def test_parsing_return_type() -> None: + from typing import Iterator, Union + from pathlib import Path + from google_takeout_parser.path_dispatch import ( + _cache_key_to_str, + _cache_key_to_type, + _handler_type_cache_key, + ) + from google_takeout_parser.models import Activity, Res, PlayStoreAppInstall + + def _test_func(path: Path) -> Iterator[Res[Activity]]: + yield Exception("test") + + ret_type = _handler_type_cache_key(_test_func) + assert ret_type is not None + assert ret_type == (Activity,) + assert _cache_key_to_str(ret_type) == "activity" + assert _cache_key_to_type(ret_type) == Activity + + def _test_multiple( + path: Path, + ) -> Iterator[Res[Union[Activity, PlayStoreAppInstall]]]: + yield Exception("test") + + ret_type = _handler_type_cache_key(_test_multiple) + assert ret_type is not None + assert ret_type == (Activity, PlayStoreAppInstall) + assert _cache_key_to_str(ret_type) == "activity_playstoreappinstall" + assert _cache_key_to_type(ret_type) == Union[Activity, PlayStoreAppInstall] From b42259cbc18c7b52a24247ec8ad75fc3fe20c631 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Sat, 30 Sep 2023 14:47:59 -0700 Subject: [PATCH 10/27] use hypens in readme install --- README.md | 2 +- google_takeout_parser/path_dispatch.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index b7fb21a..b1a16a9 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ Requires `python3.8+` To install with pip, run: - pip install google_takeout_parser + pip install google-takeout-parser ## Usage diff --git a/google_takeout_parser/path_dispatch.py b/google_takeout_parser/path_dispatch.py index 4da5de5..064037e 100644 --- a/google_takeout_parser/path_dispatch.py +++ b/google_takeout_parser/path_dispatch.py @@ -99,8 +99,8 @@ def _handler_type_cache_key(handler: HandlerFunction) -> CacheKey: def _cache_key_to_type(c: CacheKey) -> Any: """ - If theres one item in the cache key, return that - If theres multiple, return a Union of them + If there's one item in the cache 
key, return that + If there's multiple, return a Union of them """ assert len(c) > 0 if len(c) == 1: From 0f21d2d7d3883972334fede68963fb11a40c7299 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Sun, 1 Oct 2023 01:05:13 -0700 Subject: [PATCH 11/27] add go script to split large html files --- README.md | 7 + google_takeout_parser/path_dispatch.py | 5 +- split_html/.gitignore | 32 ++++ split_html/README.md | 47 +++++ split_html/go.mod | 5 + split_html/go.sum | 2 + split_html/split_html_activity.go | 227 +++++++++++++++++++++++++ tests/test_split_html.py | 74 ++++++++ 8 files changed, 397 insertions(+), 2 deletions(-) create mode 100644 split_html/.gitignore create mode 100644 split_html/README.md create mode 100644 split_html/go.mod create mode 100644 split_html/go.sum create mode 100644 split_html/split_html_activity.go create mode 100644 tests/test_split_html.py diff --git a/README.md b/README.md index b1a16a9..20a8c0d 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ Parses data out of your [Google Takeout](https://takeout.google.com/) (History, - [Usage](#usage) - [CLI Usage](#cli-usage) - [Library Usage](#library-usage) +- [Legacy HTML Parsing](#legacy-html-parsing) - [Contributing](#contributing) - [Testing](#testing) @@ -168,6 +169,12 @@ len(locations) I personally exclusively use this through the [HPI google takeout](https://github.com/karlicoss/HPI/blob/master/my/google/takeout/parser.py) file, as a configuration layer to locate where my takeouts are on disk, and since that 'automatically' unzips the takeouts (I store them as the zips), i.e., doesn't require me to maintain an unpacked view +### Legacy HTML Parsing + +I would *heavily recommend against* using the HTML format for `My Activity`. It is not always possible to properly parse the metadata, is more prone to errors parsing dates due to local timezones, and takes much longer to parse than the JSON output. + +On certain machines, the giant HTML files may even take so much memory that the process is eventually killed for using too much memory. For a workaround, see [split_html](./split_html). + ### Contributing Just to give a brief overview, to add new functionality (parsing some new folder that this doesn't currently support), you'd need to: diff --git a/google_takeout_parser/path_dispatch.py b/google_takeout_parser/path_dispatch.py index 064037e..43db216 100644 --- a/google_takeout_parser/path_dispatch.py +++ b/google_takeout_parser/path_dispatch.py @@ -164,7 +164,8 @@ def _cache_key_to_type(c: CacheKey) -> Any: r"My Activity/Voice and Audio/.*.mp3": None, r"My Activity/Takeout": None, # activity for when you made takeouts, dont need # HTML 'My Activity' Files - r"My Activity/.*?My\s*Activity.html": _parse_html_activity, + # the \d+ is for split html files, see the ./split_html directory + r"My Activity/.*?My\s*Activity(-\d+)?.html": _parse_html_activity, r"My Activity/.*?My\s*Activity.json": _parse_json_activity, # Maybe parse these? r"Access Log Activity": None, @@ -283,7 +284,7 @@ def _match_handler(p: Path, handler: HandlerMap) -> HandlerMatch: # TODO: cache? 
may run into issues though def dispatch_map(self) -> Dict[Path, HandlerFunction]: res: Dict[Path, HandlerFunction] = {} - for f in self.takeout_dir.rglob("*"): + for f in sorted(self.takeout_dir.rglob("*")): if f.name.startswith("."): continue if not f.is_file(): diff --git a/split_html/.gitignore b/split_html/.gitignore new file mode 100644 index 0000000..4bffd80 --- /dev/null +++ b/split_html/.gitignore @@ -0,0 +1,32 @@ +*.html +scripts +split_html +split_html_activity +# Created by https://www.toptal.com/developers/gitignore/api/go +# Edit at https://www.toptal.com/developers/gitignore?templates=go + +### Go ### +# If you prefer the allow list template instead of the deny list, see community template: +# https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore +# +# Binaries for programs and plugins +*.exe +*.exe~ +*.dll +*.so +*.dylib + +# Test binary, built with `go test -c` +*.test + +# Output of the go coverage tool, specifically when used with LiteIDE +*.out + +# Dependency directories (remove the comment below to include it) +# vendor/ + +# Go workspace file +go.work + +# End of https://www.toptal.com/developers/gitignore/api/go + diff --git a/split_html/README.md b/split_html/README.md new file mode 100644 index 0000000..61f063b --- /dev/null +++ b/split_html/README.md @@ -0,0 +1,47 @@ +### NOTE: This is for splitting the old HTML format, you shouldn't use this for new exports + +This dir contains a go script to split an HTML file into smaller chunks, so its possible to parse on machines with limited memory. + +In particular, I had issues using this with [termux](https://termux.dev/en/) on my phone, as the ~100MB takeout HTML files when parsed by loading the whole file into memory cause my terminal to just crash since it runs out of memory + +So, this script splits the HTML files into lots of smaller chunks, like: + +``` +MyActivity-001.html +MyActivity-002.html +MyActivity-003.html +MyActivity-004.html +MyActivity-005.html +MyActivity-006.html +``` + +To build: `go build -o split_html` + +``` +Usage: ./split_html [options] input + -count int + how many cells to split into each file (default 1000) + -output string + output directory. 
if not specified, will use the directory of the input file +``` + +Then, use it against any large files that you have problems parsing: + +```bash +./split_html ~/data/takeout/something/MyActivity/YouTube/MyActivity.html +# move other file somewhere else +mv ~/data/takeout/something/MyActivity/Youtube/MyActivity.html /tmp +# test parsing to make sure they still work +google_takeout_parser merge -a summary ~/data/takeout/something +``` + +This splits the files into dozens of files about `~700K` instead of the giant HTML files + +I personally **created copies** of all of my HTML exports, and did: + +``` +find ~/Downloads/takeout/ -name 'MyActivity.html' -exec ./split_html "{}" \; +find ~/Downloads/takeout/ -name 'MyActivity.html' -delete +``` + +And then used `google_takeout_parser merge -a summary` to compare the new and old outputs before removing the old files diff --git a/split_html/go.mod b/split_html/go.mod new file mode 100644 index 0000000..2c836b8 --- /dev/null +++ b/split_html/go.mod @@ -0,0 +1,5 @@ +module github.com/seanbreckenridge/google_takeout_parser/scripts + +go 1.18.0 + +require golang.org/x/net v0.15.0 diff --git a/split_html/go.sum b/split_html/go.sum new file mode 100644 index 0000000..9746d02 --- /dev/null +++ b/split_html/go.sum @@ -0,0 +1,2 @@ +golang.org/x/net v0.15.0 h1:ugBLEUaxABaB5AJqW9enI0ACdci2RUd4eP51NTBvuJ8= +golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= diff --git a/split_html/split_html_activity.go b/split_html/split_html_activity.go new file mode 100644 index 0000000..033e5e0 --- /dev/null +++ b/split_html/split_html_activity.go @@ -0,0 +1,227 @@ +package main + +import ( + "bytes" + "flag" + "fmt" + "io" + "os" + "path/filepath" + "strings" + + "golang.org/x/net/html" +) + +type Flags struct { + input string + cellCount int + outputDir string +} + +func parseFlags() (*Flags, error) { + var input string + var outputDir string + var cellCount int + flag.IntVar(&cellCount, "count", 1000, "how many cells to split into each file") + flag.StringVar(&outputDir, "output", "", "output directory. 
if not specified, will use the directory of the input file") + flag.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: %s [options] input\n", os.Args[0]) + flag.PrintDefaults() + } + flag.Parse() + if flag.NArg() == 0 { + flag.Usage() + fmt.Printf("an input file is required\n") + os.Exit(1) + } + input = flag.Arg(0) + + info, err := os.Stat(input) + if os.IsNotExist(err) { + return nil, fmt.Errorf("input file '%s' does not exist", input) + } + + if info.IsDir() { + return nil, fmt.Errorf("input file '%s' is a directory", input) + } + + absPath, err := filepath.Abs(input) + if err != nil { + return nil, err + } + + if outputDir == "" { + outputDir = filepath.Dir(absPath) + } + + if cellCount < 1 { + return nil, fmt.Errorf("cell count must be greater than 0") + } + + return &Flags{input: input, cellCount: cellCount, outputDir: outputDir}, nil +} + +// Function to check if a token has a specific class +func hasClass(token html.Token, class string) bool { + for _, attr := range token.Attr { + if attr.Key == "class" && strings.Contains(attr.Val, class) { + return true + } + } + return false +} + +func readAndWriteToPartials(flags *Flags) error { + input, err := os.Open(flags.input) + if err != nil { + return err + } + defer input.Close() + + var outputFile *os.File + var outputFileName string + currentFile := 1 + + openOutputFile := func() error { + // if we have an open file, close it and increment the current file + if outputFile != nil { + outputFile.Close() + currentFile++ + } + outputFileName = filepath.Join(flags.outputDir, fmt.Sprintf("MyActivity-%04d.html", currentFile)) + var err error + outputFile, err = os.Create(outputFileName) + if err != nil { + return err + } + return nil + } + + err = openOutputFile() + if err != nil { + return err + } + defer outputFile.Close() + + writtenCount := 0 + + writeBuffer := func(data []byte) error { + // if we've written enough, close the file and open a new one + if writtenCount >= flags.cellCount { + ferr := openOutputFile() + if ferr != nil { + return ferr + } + writtenCount = 0 + if outputFile == nil { + return fmt.Errorf("output file is not open") + } + } + // note: panics if outputFile is nil, but that should never happen + _, err := outputFile.Write(data) + return err + } + + z := html.NewTokenizer(input) + // find the div.outer-cell and 'start' a block there, + // copying all tokens till we end that block + // need to keep track of divDepth, so we know when we're done + divDepth := 0 + inBlock := false + var blockContent bytes.Buffer + + blockContent = bytes.Buffer{} + + for { + tt := z.Next() + switch tt { + case html.ErrorToken: + if z.Err() == io.EOF { + // done, defers should cleanup the rest + if len(blockContent.String()) > 0 { + return fmt.Errorf("found EOF, but block content is not empty") + } + return nil + } else { + return z.Err() + } + case html.DoctypeToken, html.CommentToken: + // skip these + case html.StartTagToken: + t := z.Token() + if t.Data == "div" && hasClass(t, "outer-cell") { + if inBlock { + return fmt.Errorf("found start tag for outer-cell, but we're already in a block") + } + inBlock = true + } + // if were in the block, write any start tags to the block content + if inBlock { + blockContent.Write(z.Raw()) + if t.Data == "div" { + divDepth++ + } + } + + case html.EndTagToken: + t := z.Token() + // if we're ending a tag for outer-cell, we would be here at depth 1 + // this means we're done with the block + if inBlock && divDepth == 1 { + // we're done with the block, write it out + // and reset the block content + + 
// add the end tag to the block content + blockContent.Write(z.Raw()) + blockContent.Write([]byte("\n")) + + // write to file + writeBuffer(blockContent.Bytes()) + writtenCount++ + + // reset the block content + blockContent.Reset() + inBlock = false + divDepth = 0 + } + + // otherwise, if we're in a block, add the end tag to the block content + if inBlock { + if t.Data == "div" { + divDepth-- + } + blockContent.Write(z.Raw()) + } + + case html.SelfClosingTagToken, html.TextToken: + // if we're in a block, add data to the buffer + if inBlock { + blockContent.Write(z.Raw()) + } + default: + return fmt.Errorf("unknown token type: %v", tt) + } + } +} + +func splitHtmlActivity() error { + flags, err := parseFlags() + if err != nil { + return err + } + + rerr := readAndWriteToPartials(flags) + if rerr != nil { + return rerr + } + + return nil +} + +func main() { + err := splitHtmlActivity() + if err != nil { + fmt.Println(err) + os.Exit(1) + } +} diff --git a/tests/test_split_html.py b/tests/test_split_html.py new file mode 100644 index 0000000..eadf07d --- /dev/null +++ b/tests/test_split_html.py @@ -0,0 +1,74 @@ +import os +import tempfile +import subprocess +from typing import Generator +from pathlib import Path + +import pytest + +from .common import this_dir +from google_takeout_parser.parse_html.activity import _parse_html_activity + + +activity_html_file = str( + Path("~/.cache/gt/Takeout-Old/My Activity/YouTube/MyActivity.html") + .expanduser() + .absolute() +) + +golang_dir = this_dir / ".." / "split_html" + + +@pytest.fixture +def in_golang_dir() -> Generator[None, None, None]: + current_dir = os.getcwd() + try: + os.chdir(golang_dir) + yield + finally: + os.chdir(current_dir) + + +@pytest.mark.skipif( + "TEST_GOLANG_SPLIT" not in os.environ, + reason="TEST_GOLANG_SPLIT not set, skipping test", +) +def test_split_html(in_golang_dir) -> None: + with tempfile.TemporaryDirectory() as temp_dir: + subprocess.run( + [ + "go", + "run", + golang_dir / "split_html_activity.go", + "-output", + temp_dir, + activity_html_file, + ], + check=True, + ) + + assert Path(temp_dir).is_dir() + + from_merged = [] + for file in Path(temp_dir).iterdir(): + assert file.is_file() + assert file.stat().st_size > 0 + + for x in _parse_html_activity(file): + if isinstance(x, Exception): + raise x + from_merged.append(x) + + from_original = [ + a + for a in _parse_html_activity(Path(activity_html_file)) + if not isinstance(a, Exception) + ] + + assert len(from_merged) == len(from_original) + + from_merged.sort(key=lambda x: x.time) + from_original.sort(key=lambda x: x.time) + + for a, b in zip(from_merged, from_original): + assert a == b From d2153460c6aab258d1ac480dbb34da872ebdd547 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Sun, 1 Oct 2023 01:07:11 -0700 Subject: [PATCH 12/27] autoformat --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 20a8c0d..e7a6736 100644 --- a/README.md +++ b/README.md @@ -171,7 +171,7 @@ I personally exclusively use this through the [HPI google takeout](https://githu ### Legacy HTML Parsing -I would *heavily recommend against* using the HTML format for `My Activity`. It is not always possible to properly parse the metadata, is more prone to errors parsing dates due to local timezones, and takes much longer to parse than the JSON output. +I would _heavily recommend against_ using the HTML format for `My Activity`. 
It is not always possible to properly parse the metadata, is more prone to errors parsing dates due to local timezones, and takes much longer to parse than the JSON output. On certain machines, the giant HTML files may even take so much memory that the process is eventually killed for using too much memory. For a workaround, see [split_html](./split_html). From 4a2c90664df83b05e6751624813451b2ff48fff9 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Sun, 1 Oct 2023 01:12:05 -0700 Subject: [PATCH 13/27] exclude split_html in package discovery --- setup.cfg | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index b2bd51d..d38caae 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,7 +36,8 @@ include_package_data = True [options.packages.find] exclude = - tests* + tests + split_html include = google_takeout_parser google_takeout_parser.parse_html From c27b11341cf6a663df9a02e5a5cba3c6178351b4 Mon Sep 17 00:00:00 2001 From: seanbreckenridge Date: Sun, 1 Oct 2023 01:27:03 -0700 Subject: [PATCH 14/27] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e7a6736..a581da4 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,8 @@ Since the Takeout slowly removes old events over time, I would recommend periodi - Select JSON as format - In options, deselect `music-library-songs`, `music-uploads` and `videos` +**Be sure to select JSON whenever possible**. Code to parse the HTML format is included here, but it is treated as legacy code and comes with worse performance and a myriad of other issues. See [legacy html parsing](#legacy-html-parsing) + The process for getting these isn't that great -- you have to manually go to [takeout.google.com](https://takeout.google.com) every few months, select what you want to export info for, and then it puts the zipped file into your google drive. You can tell it to run it at specific intervals, but I personally haven't found that to be that reliable. This currently parses: @@ -171,7 +173,7 @@ I personally exclusively use this through the [HPI google takeout](https://githu ### Legacy HTML Parsing -I would _heavily recommend against_ using the HTML format for `My Activity`. It is not always possible to properly parse the metadata, is more prone to errors parsing dates due to local timezones, and takes much longer to parse than the JSON output. +I would _heavily recommend against_ using the HTML format for `My Activity`. It is not always possible to properly parse the metadata, is more prone to errors parsing dates due to local timezones, and takes much longer to parse than the JSON format. On certain machines, the giant HTML files may even take so much memory that the process is eventually killed for using too much memory. For a workaround, see [split_html](./split_html). 
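A note on the guidance above: the JSON and legacy HTML exports of `My Activity` end up in the same `Activity` model, they just go through different handlers. The sketch below is illustrative only (the file paths are placeholders, and the underscore-prefixed functions are internal helpers rather than a supported API), but it shows the tradeoff in practice: the JSON handler is faster and keeps richer metadata, while the HTML handler exists mainly for old exports and is what the split_html workaround is for.

```python
# Sketch only, not a supported API: both handlers yield Activity models or Exceptions.
# "MyActivity.json" / "MyActivity.html" are placeholder paths for illustration.
from pathlib import Path

from google_takeout_parser.parse_json import _parse_json_activity
from google_takeout_parser.parse_html.activity import _parse_html_activity

json_events = [e for e in _parse_json_activity(Path("MyActivity.json")) if not isinstance(e, Exception)]
html_events = [e for e in _parse_html_activity(Path("MyActivity.html")) if not isinstance(e, Exception)]

# both lists contain the same Activity model, regardless of source format
print(len(json_events), len(html_events))
```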
From 6365682bb7a0320eebbf875ed2cbbad810e69dcb Mon Sep 17 00:00:00 2001 From: seanbreckenridge Date: Sun, 1 Oct 2023 12:06:00 -0700 Subject: [PATCH 15/27] make internal types in activity namedtuples (#47) --- google_takeout_parser/models.py | 31 +++++++++++++------- google_takeout_parser/parse_html/activity.py | 22 +++++++------- google_takeout_parser/parse_json.py | 30 ++++++++++++------- split_html/README.md | 2 +- tests/test_split_html.py | 5 +++- 5 files changed, 55 insertions(+), 35 deletions(-) diff --git a/google_takeout_parser/models.py b/google_takeout_parser/models.py index 58ddd6d..be1273b 100644 --- a/google_takeout_parser/models.py +++ b/google_takeout_parser/models.py @@ -7,24 +7,33 @@ from __future__ import annotations from datetime import datetime -from typing import Optional, List, Tuple, Any, Union, Iterator, Dict, Protocol +from typing import ( + Optional, + List, + Tuple, + Any, + Union, + Iterator, + Dict, + Protocol, + NamedTuple, +) from dataclasses import dataclass from .common import Res from .log import logger -Details = str +class Subtitles(NamedTuple): + name: str + url: Optional[str] -# because of https://github.com/karlicoss/cachew/issues/28, need -# to do these as tuples instead of NamedTuples -MetaData = Optional[str] -# name, url, source, sourceUrl -LocationInfo = Tuple[MetaData, MetaData, MetaData, MetaData] - -# name, url -Subtitles = Tuple[str, MetaData] +class LocationInfo(NamedTuple): + name: Optional[str] + url: Optional[str] + source: Optional[str] + sourceUrl: Optional[str] class BaseEvent(Protocol): @@ -44,7 +53,7 @@ class Activity(BaseEvent): # a description and a subtitle, so they end up as subtitles # more lines of text describing this subtitles: List[Subtitles] - details: List[Details] + details: List[str] locationInfos: List[LocationInfo] products: List[str] diff --git a/google_takeout_parser/parse_html/activity.py b/google_takeout_parser/parse_html/activity.py index 274cbca..a2657fb 100644 --- a/google_takeout_parser/parse_html/activity.py +++ b/google_takeout_parser/parse_html/activity.py @@ -11,7 +11,7 @@ import bs4 from bs4.element import Tag, PageElement -from ..models import Activity, Subtitles, Details, LocationInfo +from ..models import Activity, Subtitles, LocationInfo from ..common import Res from ..log import logger from .html_time_utils import parse_html_dt @@ -94,7 +94,7 @@ def _parse_subtitles( else: raise RuntimeError(f"Unexpected Type {tag} {type(tag)}") - parsed_subs.append((clean_latin1_chars(buf), url)) + parsed_subs.append(Subtitles(name=clean_latin1_chars(buf), url=url)) return parsed_subs, parse_html_dt(dt_raw, file_dt=file_dt) @@ -165,8 +165,8 @@ def _is_location_api_link(url: str) -> bool: def _parse_caption( cap_cell: bs4.element.Tag, -) -> Tuple[List[Details], List[LocationInfo], List[str]]: - details: List[Details] = [] +) -> Tuple[List[str], List[LocationInfo], List[str]]: + details: List[str] = [] locationInfos: List[LocationInfo] = [] products: List[str] = [] @@ -237,15 +237,15 @@ def _parse_caption( source = textbuf locationInfos.append( - ( - name, - url, - source, - sourceUrl, + LocationInfo( + name=name, + url=url, + source=source, + sourceUrl=sourceUrl, ) ) elif header == "Details:": - details.append(Details(clean_latin1_chars(str(value[0])).strip())) + details.append(str(clean_latin1_chars(str(value[0])).strip())) else: warnings.warn(f"Unexpected header in caption {header} {value}") @@ -266,7 +266,7 @@ def _parse_activity_div( # all possible data that this div could parse dtime: datetime subtitles: 
List[Subtitles] = [] # more lines of text describing this - details: List[Details] = [] + details: List[str] = [] locationInfos: List[LocationInfo] = [] products: List[str] = [] diff --git a/google_takeout_parser/parse_json.py b/google_takeout_parser/parse_json.py index af011a5..9fd4972 100644 --- a/google_takeout_parser/parse_json.py +++ b/google_takeout_parser/parse_json.py @@ -5,10 +5,12 @@ import json from pathlib import Path from datetime import datetime, timezone -from typing import Iterator, Any, Dict, Iterable, Optional +from typing import Iterator, Any, Dict, Iterable, Optional, List from .time_utils import parse_datetime_millis from .models import ( + Subtitles, + LocationInfo, Activity, LikedYoutubeVideo, ChromeHistory, @@ -30,12 +32,14 @@ def _parse_json_activity(p: Path) -> Iterator[Res[Activity]]: yield RuntimeError(f"Activity: Top level item in '{p}' isn't a list") for blob in json_data: try: - subtitles = [] + subtitles: List[Subtitles] = [] for s in blob.get("subtitles", []): - if s == {}: - # sometimes it's just empty ("My Activity/Assistant" data circa 2018) + if not isinstance(s, dict): continue - subtitles.append((s["name"], s.get("url"))) + # sometimes it's just empty ("My Activity/Assistant" data circa 2018) + if "name" not in s: + continue + subtitles.append(Subtitles(name=s["name"], url=s.get("url"))) # till at least 2017 old_format = "snippet" in blob @@ -54,13 +58,17 @@ def _parse_json_activity(p: Path) -> Iterator[Res[Activity]]: description=blob.get("description"), time=parse_json_utc_date(time_str), subtitles=subtitles, - details=[d["name"] for d in blob.get("details", [])], + details=[ + d["name"] + for d in blob.get("details", []) + if isinstance(d, dict) and "name" in d + ], locationInfos=[ - ( - locinfo.get("name"), - locinfo.get("url"), - locinfo.get("source"), - locinfo.get("sourceUrl"), + LocationInfo( + name=locinfo.get("name"), + url=locinfo.get("url"), + source=locinfo.get("source"), + sourceUrl=locinfo.get("sourceUrl"), ) for locinfo in blob.get("locationInfos", []) ], diff --git a/split_html/README.md b/split_html/README.md index 61f063b..791d90e 100644 --- a/split_html/README.md +++ b/split_html/README.md @@ -1,4 +1,4 @@ -### NOTE: This is for splitting the old HTML format, you shouldn't use this for new exports +## This is for splitting the old HTML format, you shouldn't use this for new exports This dir contains a go script to split an HTML file into smaller chunks, so its possible to parse on machines with limited memory. 
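To make the NamedTuple change in this patch concrete, here is a small sketch with made-up field values; `Subtitles` and `LocationInfo` are still tuples underneath, so existing positional access keeps working, but fields can now be read by name:

```python
# Sketch with made-up field values, showing the named access this patch adds.
from google_takeout_parser.models import Subtitles, LocationInfo

sub = Subtitles(name="watched a video", url=None)
assert sub.name == sub[0]  # NamedTuples are still tuples, positional access keeps working

loc = LocationInfo(name=None, url=None, source="example source", sourceUrl=None)
print(loc.source, loc.sourceUrl)
```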
diff --git a/tests/test_split_html.py b/tests/test_split_html.py index eadf07d..8dedc08 100644 --- a/tests/test_split_html.py +++ b/tests/test_split_html.py @@ -50,7 +50,10 @@ def test_split_html(in_golang_dir) -> None: assert Path(temp_dir).is_dir() from_merged = [] - for file in Path(temp_dir).iterdir(): + + files = sorted(Path(temp_dir).iterdir()) + assert len(files) > 1 + for file in files: assert file.is_file() assert file.stat().st_size > 0 From 8d8965c6215c5ee753e60ac13eb5bb88ce42ac42 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Sun, 1 Oct 2023 12:32:11 -0700 Subject: [PATCH 16/27] add filter flag to cli --- README.md | 14 ++++++ google_takeout_parser/__main__.py | 59 ++++++++++++++++++++++---- google_takeout_parser/models.py | 11 +++++ google_takeout_parser/path_dispatch.py | 3 +- tests/test_types.py | 2 +- 5 files changed, 77 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index a581da4..95daae2 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,20 @@ To install with pip, run: Can be accessed by either `google_takeout_parser` or `python -m google_takeout_parser`. Offers a basic interface to list/clear the cache directory, and/or parse/merge a takeout and interact with it in a REPL: +``` +Usage: google_takeout_parser parse [OPTIONS] TAKEOUT_DIR + + Parse a takeout directory takeout + +Options: + -f, --filter [Activity|LikedYoutubeVideo|PlayStoreAppInstall|Location|ChromeHistory|YoutubeComment|PlaceVisit] + Filter to only show events of this type + -a, --action [repl|summary|json] + What to do with the parsed result [default: repl] + --cache / --no-cache [default: no-cache] + -h, --help Show this message and exit. +``` + To clear the `cachew` cache: `google_takeout_parser cache_dir clear` A few examples of parsing takeouts: diff --git a/google_takeout_parser/__main__.py b/google_takeout_parser/__main__.py index 395d545..87e5ab2 100644 --- a/google_takeout_parser/__main__.py +++ b/google_takeout_parser/__main__.py @@ -2,12 +2,14 @@ import json from datetime import datetime, date import dataclasses -from typing import List, Optional, Callable, Sequence, Any +from typing import List, Optional, Callable, Sequence, Any, Dict, Type, Tuple import click -@click.group() +@click.group( + context_settings={"help_option_names": ["-h", "--help"], "max_content_width": 120} +) @click.option( "--verbose/--quiet", default=None, @@ -30,16 +32,36 @@ def main(verbose: Optional[bool]) -> None: log.logger = log.setup(level=logging.ERROR) +# use the union of types to determine the possible filters +from .models import DEFAULT_MODEL_TYPE, get_union_args + +model_types: Optional[Tuple[Type[DEFAULT_MODEL_TYPE]]] = get_union_args( + DEFAULT_MODEL_TYPE +) +assert model_types is not None + +FILTER_OPTIONS: Dict[str, Type[DEFAULT_MODEL_TYPE]] = { + t.__name__: t for t in model_types +} + SHARED = [ click.option("--cache/--no-cache", default=False, show_default=True), click.option( "-a", "--action", - type=click.Choice(["repl", "summary", "json"]), + type=click.Choice(["repl", "summary", "json"], case_sensitive=False), default="repl", help="What to do with the parsed result", show_default=True, ), + click.option( + "-f", + "--filter", + "filter_", + type=click.Choice(list(FILTER_OPTIONS.keys()), case_sensitive=False), + multiple=False, + help="Filter to only show events of this type", + ), ] @@ -83,36 +105,55 @@ def _handle_action(res: List[Any], action: str) -> None: @main.command(short_help="parse a takeout directory") @shared_options @click.argument("TAKEOUT_DIR", 
type=click.Path(exists=True), required=True) -def parse(cache: bool, action: str, takeout_dir: str) -> None: +def parse(cache: bool, action: str, takeout_dir: str, filter_: str) -> None: """ Parse a takeout directory takeout """ - from .common import Res - from .models import BaseEvent from .path_dispatch import TakeoutParser + from .log import logger tp = TakeoutParser(takeout_dir, error_policy="drop") # note: actually no exceptions since since they're dropped - res: List[Res[BaseEvent]] = list(tp.parse(cache=cache)) + if cache: + if filter_: + logger.warn( + "As it would otherwise re-compute every time, filtering happens after loading from cache" + ) + res = list(tp.parse(cache=True)) + if filter_: + filter_type = FILTER_OPTIONS[filter_] + res = [r for r in res if isinstance(r, filter_type)] + else: + res = list(tp.parse(cache=False, filter_type=FILTER_OPTIONS.get(filter_, None))) _handle_action(res, action) @main.command(short_help="merge multiple takeout directories") @shared_options @click.argument("TAKEOUT_DIR", type=click.Path(exists=True), nargs=-1, required=True) -def merge(cache: bool, action: str, takeout_dir: Sequence[str]) -> None: +def merge(cache: bool, action: str, takeout_dir: Sequence[str], filter_: str) -> None: """ Parse and merge multiple takeout directories """ from .path_dispatch import TakeoutParser from .merge import cached_merge_takeouts, merge_events from .models import DEFAULT_MODEL_TYPE, Res + from .log import logger res: List[Res[DEFAULT_MODEL_TYPE]] = [] + filter_type: Optional[Type[DEFAULT_MODEL_TYPE]] if cache: + if filter_: + logger.warn( + "As it would otherwise re-compute every time, filtering happens after loading from cache" + ) res = list(cached_merge_takeouts(list(takeout_dir))) + if filter_: + filter_type = FILTER_OPTIONS[filter_] + res = [r for r in res if isinstance(r, filter_type)] else: - res = list(merge_events(*iter([TakeoutParser(p).parse(cache=False) for p in takeout_dir]))) # type: ignore[arg-type] + filter_type = FILTER_OPTIONS[filter_] if filter_ else None + res = list(merge_events(*iter([TakeoutParser(p).parse(cache=False, filter_type=filter_type) for p in takeout_dir]))) # type: ignore[arg-type] _handle_action(res, action) diff --git a/google_takeout_parser/models.py b/google_takeout_parser/models.py index be1273b..6e7dc4c 100644 --- a/google_takeout_parser/models.py +++ b/google_takeout_parser/models.py @@ -9,6 +9,7 @@ from datetime import datetime from typing import ( Optional, + Type, List, Tuple, Any, @@ -24,6 +25,16 @@ from .log import logger +def get_union_args(cls: Any) -> Optional[Tuple[Type]]: # type: ignore[type-arg] + if getattr(cls, "__origin__", None) != Union: + return None + + args = cls.__args__ + args = [e for e in args if e != type(None)] # noqa: E721 + assert len(args) > 0 + return args # type: ignore + + class Subtitles(NamedTuple): name: str url: Optional[str] diff --git a/google_takeout_parser/path_dispatch.py b/google_takeout_parser/path_dispatch.py index 43db216..523f591 100644 --- a/google_takeout_parser/path_dispatch.py +++ b/google_takeout_parser/path_dispatch.py @@ -27,7 +27,7 @@ from .common import Res, PathIsh from .cache import takeout_cache_path from .log import logger -from .models import BaseEvent +from .models import BaseEvent, get_union_args from .parse_html.activity import _parse_html_activity from .parse_html.comment import _parse_html_comment_file @@ -59,7 +59,6 @@ def _handler_type_cache_key(handler: HandlerFunction) -> CacheKey: # Take a function like Iterator[Union[Item, Exception]] and return 
Item import inspect - from cachew.legacy import get_union_args sig = inspect.signature(handler) diff --git a/tests/test_types.py b/tests/test_types.py index 696ef73..480dc96 100644 --- a/tests/test_types.py +++ b/tests/test_types.py @@ -1,7 +1,7 @@ import inspect import google_takeout_parser.models as mod -from cachew.legacy import get_union_args +from google_takeout_parser.models import get_union_args def test_check_union() -> None: From 602ef235549f75f0c8549a10387a51f826e365c0 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Sun, 1 Oct 2023 12:35:29 -0700 Subject: [PATCH 17/27] v0.1.5: allow parsing split html, add filter flag - adds a go script to parse large HTML into split files - path_dispatch knows to match the split files - adds a filter flag to filter to a specific model type - internal types in Activity are now namedtuples, which means the fields are tagged/more readable --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index d38caae..e15f364 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = google_takeout_parser -version = 0.1.4 +version = 0.1.5 description = Parses data out of your Google Takeout (History, Activity, Youtube, Locations, etc...) long_description = file: README.md long_description_content_type = text/markdown From 6df75eee049ede81bddae3b07bf5282246495091 Mon Sep 17 00:00:00 2001 From: seanbreckenridge Date: Sun, 1 Oct 2023 14:50:40 -0700 Subject: [PATCH 18/27] allowlist for http -> https, resolves #31 (#48) * allowlist for http -> https, resolves #31 * remove . prefix from google urls * convert suffix list to list, no reason for set * remove extra https check * mention converted to what --- google_takeout_parser/http_allowlist.py | 238 ++++++++++++++++++ google_takeout_parser/models.py | 14 +- google_takeout_parser/parse_html/activity.py | 19 +- google_takeout_parser/parse_html/comment.py | 10 +- .../parse_html/test_html_parsing.py | 10 +- google_takeout_parser/parse_json.py | 9 +- tests/test_urls.py | 34 +++ 7 files changed, 310 insertions(+), 24 deletions(-) create mode 100644 google_takeout_parser/http_allowlist.py create mode 100644 tests/test_urls.py diff --git a/google_takeout_parser/http_allowlist.py b/google_takeout_parser/http_allowlist.py new file mode 100644 index 0000000..f986671 --- /dev/null +++ b/google_takeout_parser/http_allowlist.py @@ -0,0 +1,238 @@ +import logging +from typing import Set, Optional, List + +from .log import logger + +from urllib.parse import urlsplit, urlunsplit + +# exact matches +CONVERT_HTTP: Set[str] = { + "m.youtube.com", + "www.youtube.com", + "youtube.com", + "bp0.blogger.com", +} + +# anything that ends with these domains +# curl -sL 'https://www.google.com/supported_domains +CONVERT_HTTP_SUFFIX: List[str] = [ + "google.com", + "google.ad", + "google.ae", + "google.com.af", + "google.com.ag", + "google.al", + "google.am", + "google.co.ao", + "google.com.ar", + "google.as", + "google.at", + "google.com.au", + "google.az", + "google.ba", + "google.com.bd", + "google.be", + "google.bf", + "google.bg", + "google.com.bh", + "google.bi", + "google.bj", + "google.com.bn", + "google.com.bo", + "google.com.br", + "google.bs", + "google.bt", + "google.co.bw", + "google.by", + "google.com.bz", + "google.ca", + "google.cd", + "google.cf", + "google.cg", + "google.ch", + "google.ci", + "google.co.ck", + "google.cl", + "google.cm", + "google.cn", + "google.com.co", + "google.co.cr", + "google.com.cu", + "google.cv", + "google.com.cy", + "google.cz", + 
"google.de", + "google.dj", + "google.dk", + "google.dm", + "google.com.do", + "google.dz", + "google.com.ec", + "google.ee", + "google.com.eg", + "google.es", + "google.com.et", + "google.fi", + "google.com.fj", + "google.fm", + "google.fr", + "google.ga", + "google.ge", + "google.gg", + "google.com.gh", + "google.com.gi", + "google.gl", + "google.gm", + "google.gr", + "google.com.gt", + "google.gy", + "google.com.hk", + "google.hn", + "google.hr", + "google.ht", + "google.hu", + "google.co.id", + "google.ie", + "google.co.il", + "google.im", + "google.co.in", + "google.iq", + "google.is", + "google.it", + "google.je", + "google.com.jm", + "google.jo", + "google.co.jp", + "google.co.ke", + "google.com.kh", + "google.ki", + "google.kg", + "google.co.kr", + "google.com.kw", + "google.kz", + "google.la", + "google.com.lb", + "google.li", + "google.lk", + "google.co.ls", + "google.lt", + "google.lu", + "google.lv", + "google.com.ly", + "google.co.ma", + "google.md", + "google.me", + "google.mg", + "google.mk", + "google.ml", + "google.com.mm", + "google.mn", + "google.com.mt", + "google.mu", + "google.mv", + "google.mw", + "google.com.mx", + "google.com.my", + "google.co.mz", + "google.com.na", + "google.com.ng", + "google.com.ni", + "google.ne", + "google.nl", + "google.no", + "google.com.np", + "google.nr", + "google.nu", + "google.co.nz", + "google.com.om", + "google.com.pa", + "google.com.pe", + "google.com.pg", + "google.com.ph", + "google.com.pk", + "google.pl", + "google.pn", + "google.com.pr", + "google.ps", + "google.pt", + "google.com.py", + "google.com.qa", + "google.ro", + "google.ru", + "google.rw", + "google.com.sa", + "google.com.sb", + "google.sc", + "google.se", + "google.com.sg", + "google.sh", + "google.si", + "google.sk", + "google.com.sl", + "google.sn", + "google.so", + "google.sm", + "google.sr", + "google.st", + "google.com.sv", + "google.td", + "google.tg", + "google.co.th", + "google.com.tj", + "google.tl", + "google.tm", + "google.tn", + "google.to", + "google.com.tr", + "google.tt", + "google.com.tw", + "google.co.tz", + "google.com.ua", + "google.co.ug", + "google.co.uk", + "google.com.uy", + "google.co.uz", + "google.com.vc", + "google.co.ve", + "google.co.vi", + "google.com.vn", + "google.vu", + "google.ws", + "google.rs", + "google.co.za", + "google.co.zm", + "google.co.zw", + "google.cat", +] + + +def _convert_to_https(url: str, logger: Optional[logging.Logger] = None) -> str: + uu = urlsplit(url) + if uu.scheme == "http": + if uu.netloc in CONVERT_HTTP: + return urlunsplit(("https",) + uu[1:]) + if any(uu.netloc.endswith(suffix) for suffix in CONVERT_HTTP_SUFFIX): + return urlunsplit(("https",) + uu[1:]) + if logger: + logger.debug( + "HTTP URL did not match allowlist: %s\nIf you think this should be auto-converted to HTTPS, make an issue here: https://github.com/seanbreckenridge/google_takeout_parser/issues/new", + url, + ) + # some other scheme, just return + return url + + +def _convert_to_https_opt( + url: Optional[str], logger: Optional[logging.Logger] = None +) -> Optional[str]: + if url is None: + return None + return _convert_to_https(url, logger) + + +def convert_to_https(url: str) -> str: + return _convert_to_https(url, logger=logger) + + +def convert_to_https_opt(url: Optional[str]) -> Optional[str]: + return _convert_to_https_opt(url, logger=logger) diff --git a/google_takeout_parser/models.py b/google_takeout_parser/models.py index 6e7dc4c..6cf50d2 100644 --- a/google_takeout_parser/models.py +++ b/google_takeout_parser/models.py @@ -24,6 +24,8 
@@ from .common import Res from .log import logger +Url = str + def get_union_args(cls: Any) -> Optional[Tuple[Type]]: # type: ignore[type-arg] if getattr(cls, "__origin__", None) != Union: @@ -37,14 +39,14 @@ def get_union_args(cls: Any) -> Optional[Tuple[Type]]: # type: ignore[type-arg] class Subtitles(NamedTuple): name: str - url: Optional[str] + url: Optional[Url] class LocationInfo(NamedTuple): name: Optional[str] - url: Optional[str] + url: Optional[Url] source: Optional[str] - sourceUrl: Optional[str] + sourceUrl: Optional[Url] class BaseEvent(Protocol): @@ -59,7 +61,7 @@ class Activity(BaseEvent): title: str time: datetime description: Optional[str] - titleUrl: Optional[str] + titleUrl: Optional[Url] # note: in HTML exports, there is no way to tell the difference between # a description and a subtitle, so they end up as subtitles # more lines of text describing this @@ -85,7 +87,7 @@ def key(self) -> Tuple[str, str, int]: class YoutubeComment(BaseEvent): content: str dt: datetime - urls: List[str] + urls: List[Url] @property def key(self) -> int: @@ -198,7 +200,7 @@ def otherCandidateLocations(self) -> List[CandidateLocation]: @dataclass class ChromeHistory(BaseEvent): title: str - url: str + url: Url dt: datetime @property diff --git a/google_takeout_parser/parse_html/activity.py b/google_takeout_parser/parse_html/activity.py index a2657fb..b4ce4b2 100644 --- a/google_takeout_parser/parse_html/activity.py +++ b/google_takeout_parser/parse_html/activity.py @@ -2,7 +2,6 @@ Parses the HTML MyActivity.html files that used to be the standard """ -import warnings from pathlib import Path from datetime import datetime from typing import List, Iterator, Optional, Tuple, Union, Dict, Iterable @@ -14,6 +13,7 @@ from ..models import Activity, Subtitles, LocationInfo from ..common import Res from ..log import logger +from ..http_allowlist import convert_to_https_opt from .html_time_utils import parse_html_dt @@ -90,11 +90,13 @@ def _parse_subtitles( if "href" in tag.attrs: url = tag.attrs["href"] else: - warnings.warn(f"Unexpected tag! {tag}") + logger.warning(f"Unexpected tag! 
{tag}") else: raise RuntimeError(f"Unexpected Type {tag} {type(tag)}") - parsed_subs.append(Subtitles(name=clean_latin1_chars(buf), url=url)) + parsed_subs.append( + Subtitles(name=clean_latin1_chars(buf), url=convert_to_https_opt(url)) + ) return parsed_subs, parse_html_dt(dt_raw, file_dt=file_dt) @@ -239,16 +241,16 @@ def _parse_caption( locationInfos.append( LocationInfo( name=name, - url=url, + url=convert_to_https_opt(url), source=source, - sourceUrl=sourceUrl, + sourceUrl=convert_to_https_opt(sourceUrl), ) ) elif header == "Details:": details.append(str(clean_latin1_chars(str(value[0])).strip())) else: - warnings.warn(f"Unexpected header in caption {header} {value}") + logger.warning(f"Unexpected header in caption {header} {value}") return details, locationInfos, products @@ -318,8 +320,9 @@ def _parse_activity_div( return Activity( header=header, - title=title_info[0], - titleUrl=title_info[1], # could be None, matched by model + title=title_info.name, + # could be None, matched the JSON format + titleUrl=convert_to_https_opt(title_info.url), description=None, # always none since we can't differentiate in HTML parsing time=dtime, locationInfos=locationInfos, diff --git a/google_takeout_parser/parse_html/comment.py b/google_takeout_parser/parse_html/comment.py index a3e2a28..4f25f5f 100644 --- a/google_takeout_parser/parse_html/comment.py +++ b/google_takeout_parser/parse_html/comment.py @@ -1,12 +1,13 @@ import re from pathlib import Path -from typing import Iterator +from typing import Iterator, List from datetime import datetime, timezone import bs4 from ..models import YoutubeComment from ..common import Res +from ..http_allowlist import convert_to_https from .activity import _group_by_brs, clean_latin1_chars # seems to always be in UTC? @@ -45,7 +46,10 @@ def _parse_html_li(li: bs4.element.Tag) -> YoutubeComment: desc += str(tag) elif isinstance(tag, bs4.element.Tag): desc += str(tag.text) - urls = list({link.attrs["href"] for link in li.select("a") if "href" in link.attrs}) + urls: List[str] = [] + for link in li.select("a"): + if "href" in link.attrs: + urls.append(convert_to_https(link.attrs["href"])) return YoutubeComment( content=clean_latin1_chars(desc).strip(), urls=urls, dt=parsed_date ) @@ -70,5 +74,5 @@ def test_parse_html_comment_file() -> None: assert parsed_li == YoutubeComment( content="content here", dt=datetime(2020, 4, 27, 23, 18, 23, tzinfo=timezone.utc), - urls=["http://www.youtube.com/watch?v=mM"], + urls=["https://www.youtube.com/watch?v=mM"], ) diff --git a/google_takeout_parser/parse_html/test_html_parsing.py b/google_takeout_parser/parse_html/test_html_parsing.py index 229df1c..984fd8d 100644 --- a/google_takeout_parser/parse_html/test_html_parsing.py +++ b/google_takeout_parser/parse_html/test_html_parsing.py @@ -2,6 +2,8 @@ from .activity import _parse_subtitles, _parse_caption, _is_location_api_link +# NOTE: some of the URLs here have been converted to http from https to test the http_allowlist.py conversion + # bring into scope from .comment import test_parse_html_comment_file # noqa: F401 from .html_time_utils import test_parse_dt # noqa: F401 @@ -15,7 +17,7 @@ def bs4_div(html: str) -> bs4.element.Tag: def test_parse_subtitles() -> None: content = bs4_div( - """""" + """""" ) res = _parse_subtitles(content, file_dt=None) assert not isinstance(res, Exception) @@ -39,7 +41,7 @@ def test_parse_subtitles() -> None: assert int(dt.timestamp()) == 1599242506 content = bs4_div( - """
1 notification
Including topics:
Emergency resources and information
Sep 1, 2020, 9:27:07 PM PDT
""", + """
1 notification
Including topics:
Emergency resources and information
Sep 1, 2020, 9:27:07 PM PDT
""", ) res = _parse_subtitles(content, file_dt=None) assert not isinstance(res, Exception) @@ -71,7 +73,7 @@ def test_parse_captions() -> None: def test_parse_locations() -> None: content = bs4_div( - """
Products:
 Discover
Locations:
 At this general area - From your places (Home)
""" + """
Products:
 Discover
Locations:
 At this general area - From your places (Home)
""" ) details, locationInfos, products = _parse_caption(content) @@ -89,7 +91,7 @@ def test_parse_locations() -> None: ] content = bs4_div( - """
Products:
 Maps
Locations:
 At this general area - Based on your past activity
""" + """
Products:
 Maps
Locations:
 At this general area - Based on your past activity
""" ) details, locationInfos, products = _parse_caption(content) diff --git a/google_takeout_parser/parse_json.py b/google_takeout_parser/parse_json.py index 9fd4972..2ad562e 100644 --- a/google_takeout_parser/parse_json.py +++ b/google_takeout_parser/parse_json.py @@ -7,6 +7,7 @@ from datetime import datetime, timezone from typing import Iterator, Any, Dict, Iterable, Optional, List +from .http_allowlist import convert_to_https_opt from .time_utils import parse_datetime_millis from .models import ( Subtitles, @@ -54,7 +55,7 @@ def _parse_json_activity(p: Path) -> Iterator[Res[Activity]]: yield Activity( header=header, title=blob["title"], - titleUrl=blob.get("titleUrl"), + titleUrl=convert_to_https_opt(blob.get("titleUrl")), description=blob.get("description"), time=parse_json_utc_date(time_str), subtitles=subtitles, @@ -66,9 +67,9 @@ def _parse_json_activity(p: Path) -> Iterator[Res[Activity]]: locationInfos=[ LocationInfo( name=locinfo.get("name"), - url=locinfo.get("url"), + url=convert_to_https_opt(locinfo.get("url")), source=locinfo.get("source"), - sourceUrl=locinfo.get("sourceUrl"), + sourceUrl=convert_to_https_opt(locinfo.get("sourceUrl")), ) for locinfo in blob.get("locationInfos", []) ], @@ -215,6 +216,8 @@ def _parse_chrome_history(p: Path) -> Iterator[Res[ChromeHistory]]: time_naive = datetime.utcfromtimestamp(item["time_usec"] / 10**6) yield ChromeHistory( title=item["title"], + # dont convert to https here, this is just the users history + # and theres likely lots of items that arent https url=item["url"], dt=time_naive.replace(tzinfo=timezone.utc), ) diff --git a/tests/test_urls.py b/tests/test_urls.py new file mode 100644 index 0000000..f41fee6 --- /dev/null +++ b/tests/test_urls.py @@ -0,0 +1,34 @@ +import logging +from google_takeout_parser.http_allowlist import _convert_to_https + + +def test__convert_to_https(caplog) -> None: + url = "http://www.google.com" + assert _convert_to_https(url) == "https://www.google.com" + + url = "http://youtube.com" + assert _convert_to_https(url) == "https://youtube.com" + + url = "https://youtube.com" + assert _convert_to_https(url) == "https://youtube.com" + + url = "http://maps.google.com/something+else" + assert _convert_to_https(url) == "https://maps.google.com/something+else" + + from logzero import logger + + logger.propagate = True + + # catpure logs + url = "http://www.otherurl.com" + + caplog.clear() + + with caplog.at_level(logging.DEBUG): + assert _convert_to_https(url, logger) == "http://www.otherurl.com" + + assert len(caplog.records) == 1 + assert ( + "HTTP URL did not match allowlist: http://www.otherurl.com\nIf you think this should be auto-converted to HTTPS, make an issue here:" + in caplog.records[0].message + ) From b23af213abde92b0f99d7921637aa16bb67d6b67 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Sun, 1 Oct 2023 15:55:36 -0700 Subject: [PATCH 19/27] match all youtube links, stricter tests --- google_takeout_parser/http_allowlist.py | 21 ++++++++--------- tests/test_urls.py | 30 +++++++++++++++---------- 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/google_takeout_parser/http_allowlist.py b/google_takeout_parser/http_allowlist.py index f986671..a3fec83 100644 --- a/google_takeout_parser/http_allowlist.py +++ b/google_takeout_parser/http_allowlist.py @@ -1,21 +1,17 @@ import logging -from typing import Set, Optional, List +from typing import Set, Optional from .log import logger from urllib.parse import urlsplit, urlunsplit -# exact matches -CONVERT_HTTP: Set[str] = { - "m.youtube.com", 
- "www.youtube.com", - "youtube.com", - "bp0.blogger.com", -} +CONVERT_HTTP: Set[str] = set() # anything that ends with these domains # curl -sL 'https://www.google.com/supported_domains -CONVERT_HTTP_SUFFIX: List[str] = [ +CONVERT_HTTP_SUFFIX: Set[str] = { + "youtube.com", + "bp0.blogger.com", "google.com", "google.ad", "google.ae", @@ -203,15 +199,16 @@ "google.co.zm", "google.co.zw", "google.cat", -] +} def _convert_to_https(url: str, logger: Optional[logging.Logger] = None) -> str: uu = urlsplit(url) if uu.scheme == "http": - if uu.netloc in CONVERT_HTTP: + without_www = uu.netloc[4:] if uu.netloc.startswith("www.") else uu.netloc + if without_www in CONVERT_HTTP or without_www in CONVERT_HTTP_SUFFIX: return urlunsplit(("https",) + uu[1:]) - if any(uu.netloc.endswith(suffix) for suffix in CONVERT_HTTP_SUFFIX): + if any(without_www.endswith(suffix) for suffix in CONVERT_HTTP_SUFFIX): return urlunsplit(("https",) + uu[1:]) if logger: logger.debug( diff --git a/tests/test_urls.py b/tests/test_urls.py index f41fee6..cdd97f3 100644 --- a/tests/test_urls.py +++ b/tests/test_urls.py @@ -3,24 +3,30 @@ def test__convert_to_https(caplog) -> None: - url = "http://www.google.com" - assert _convert_to_https(url) == "https://www.google.com" + with caplog.at_level(logging.DEBUG): + url = "http://www.google.com" + assert _convert_to_https(url) == "https://www.google.com" + + url = "http://youtube.com" + assert _convert_to_https(url) == "https://youtube.com" + + url = "https://youtube.com" + assert _convert_to_https(url) == "https://youtube.com" - url = "http://youtube.com" - assert _convert_to_https(url) == "https://youtube.com" + url = "http://maps.google.com/something+else" + assert _convert_to_https(url) == "https://maps.google.com/something+else" - url = "https://youtube.com" - assert _convert_to_https(url) == "https://youtube.com" + url = "http://m.youtube.com/watch?v=123" + assert _convert_to_https(url) == "https://m.youtube.com/watch?v=123" - url = "http://maps.google.com/something+else" - assert _convert_to_https(url) == "https://maps.google.com/something+else" + from logzero import logger - from logzero import logger + logger.propagate = True - logger.propagate = True + # catpure logs + url = "http://www.otherurl.com" - # catpure logs - url = "http://www.otherurl.com" + assert len(caplog.records) == 0 caplog.clear() From 431506e73a7bac17fe9715fa41d3b81f2feb8f08 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Sun, 1 Oct 2023 16:09:30 -0700 Subject: [PATCH 20/27] explicitly match subdomains with endswith check, add www test --- google_takeout_parser/http_allowlist.py | 4 +++- tests/test_urls.py | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/google_takeout_parser/http_allowlist.py b/google_takeout_parser/http_allowlist.py index a3fec83..cf4ebfa 100644 --- a/google_takeout_parser/http_allowlist.py +++ b/google_takeout_parser/http_allowlist.py @@ -208,7 +208,9 @@ def _convert_to_https(url: str, logger: Optional[logging.Logger] = None) -> str: without_www = uu.netloc[4:] if uu.netloc.startswith("www.") else uu.netloc if without_www in CONVERT_HTTP or without_www in CONVERT_HTTP_SUFFIX: return urlunsplit(("https",) + uu[1:]) - if any(without_www.endswith(suffix) for suffix in CONVERT_HTTP_SUFFIX): + # check if this is a subdomain of a domain in the allowlist + # like m.youtube.com + if any(without_www.endswith("." 
+ suffix) for suffix in CONVERT_HTTP_SUFFIX): return urlunsplit(("https",) + uu[1:]) if logger: logger.debug( diff --git a/tests/test_urls.py b/tests/test_urls.py index cdd97f3..6861a5a 100644 --- a/tests/test_urls.py +++ b/tests/test_urls.py @@ -10,6 +10,9 @@ def test__convert_to_https(caplog) -> None: url = "http://youtube.com" assert _convert_to_https(url) == "https://youtube.com" + url = "http://www.youtube.com" + assert _convert_to_https(url) == "https://www.youtube.com" + url = "https://youtube.com" assert _convert_to_https(url) == "https://youtube.com" From fa2f52332b86ce7756e6bd9eb5aee997b44c9e79 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Wed, 4 Oct 2023 22:12:26 -0700 Subject: [PATCH 21/27] remove unused requirements file --- requirements.txt | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 0f0d686..0000000 --- a/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -click>=8.0 -logzero>=1.7.0 -lxml>=4.6.0 -beautifulsoup4>=4.9.0 -cachew>=0.11.0 -pytz>=2021.3 -IPython -platformdirs>=2.3.0 From 98edd283a57df1a4e0870e8736176c73a3d8ae4b Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Wed, 4 Oct 2023 22:56:10 -0700 Subject: [PATCH 22/27] update docs --- google_takeout_parser/http_allowlist.py | 9 +++++++++ google_takeout_parser/parse_json.py | 2 +- split_html/README.md | 2 +- tests/test_urls.py | 2 +- 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/google_takeout_parser/http_allowlist.py b/google_takeout_parser/http_allowlist.py index cf4ebfa..cf22bb8 100644 --- a/google_takeout_parser/http_allowlist.py +++ b/google_takeout_parser/http_allowlist.py @@ -1,3 +1,12 @@ +""" +For context, see: https://github.com/seanbreckenridge/google_takeout_parser/issues/31 + +This converts HTTP URLs to HTTPS, if they're from certain google domains. +In some cases URLs in the takeout are HTTP for no reason, and converting them +to HTTPs is nicer for downstream consumers, e.g. to dedupe, parsing from multiple +sources +""" + import logging from typing import Set, Optional diff --git a/google_takeout_parser/parse_json.py b/google_takeout_parser/parse_json.py index 2ad562e..2bc5819 100644 --- a/google_takeout_parser/parse_json.py +++ b/google_takeout_parser/parse_json.py @@ -217,7 +217,7 @@ def _parse_chrome_history(p: Path) -> Iterator[Res[ChromeHistory]]: yield ChromeHistory( title=item["title"], # dont convert to https here, this is just the users history - # and theres likely lots of items that arent https + # and there's likely lots of items that aren't https url=item["url"], dt=time_naive.replace(tzinfo=timezone.utc), ) diff --git a/split_html/README.md b/split_html/README.md index 791d90e..5a9b4ae 100644 --- a/split_html/README.md +++ b/split_html/README.md @@ -35,7 +35,7 @@ mv ~/data/takeout/something/MyActivity/Youtube/MyActivity.html /tmp google_takeout_parser merge -a summary ~/data/takeout/something ``` -This splits the files into dozens of files about `~700K` instead of the giant HTML files +This splits the `100MB+` HTML files into dozens of small files sized about `~700K`. 
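For a quick sanity check (just a sketch, assuming the split pieces end up as `.html` files next to the original), you can list them and their sizes:

```bash
# the huge MyActivity.html was moved out above, so only the split files remain
du -h ~/data/takeout/something/MyActivity/Youtube/*.html | sort -h
```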
I personally **created copies** of all of my HTML exports, and did: diff --git a/tests/test_urls.py b/tests/test_urls.py index 6861a5a..4a05125 100644 --- a/tests/test_urls.py +++ b/tests/test_urls.py @@ -26,7 +26,7 @@ def test__convert_to_https(caplog) -> None: logger.propagate = True - # catpure logs + # capture logs url = "http://www.otherurl.com" assert len(caplog.records) == 0 From 229f0f12debcbc7f221e714e4f6feb739d453ca8 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Fri, 6 Oct 2023 11:30:33 -0700 Subject: [PATCH 23/27] update example to use new filter flag --- README.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 95daae2..5214226 100644 --- a/README.md +++ b/README.md @@ -106,11 +106,14 @@ Counter({'Activity': 366292, 'ChromeHistory': 4}) ``` -Can also dump the info to JSON; e.g. to filter YouTube links from your Activity: +Can also dump the info to JSON; e.g. to filter YouTube-related stuff from your Activity using [jq](https://jqlang.github.io/jq/): ```bash -google_takeout_parser parse -a json --no-cache ./Takeout-New \ - | jq '.[] | select(.type == "Activity") | select(.header == "YouTube") | .titleUrl' +google_takeout_parser --quiet parse -a json -f Activity --no-cache ./Takeout-New | \ + # select stuff like Youtube, m.youtube.com, youtube.com using jq + jq '.[] | select(.header | ascii_downcase | test("youtube")) | select(.titleUrl)' | \ + # grab the titleUrl + jq .titleUrl -r ``` Also contains a small utility command to help move/extract the google takeout: @@ -167,7 +170,7 @@ If you don't want to cache the results but want to merge results from multiple t from google_takeout_parser.merge import merge_events, TakeoutParser itrs = [] # list of iterators of google events for path in ['path/to/Takeout-1599315526' 'path/to/Takeout-1616796262']: - # ignore errors + # ignore errors, error_policy can be 'yield', 'raise' or 'drop' tk = TakeoutParser(path, error_policy="drop") itrs.append(tk.parse(cache=False)) res = list(merge_events(*itrs)) From 8312d7fd45125056e778d7b318179c8f2a9078ab Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Fri, 6 Oct 2023 15:31:19 -0700 Subject: [PATCH 24/27] simplify jq example --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 5214226..6f9c423 100644 --- a/README.md +++ b/README.md @@ -109,11 +109,11 @@ Counter({'Activity': 366292, Can also dump the info to JSON; e.g. 
to filter YouTube-related stuff from your Activity using [jq](https://jqlang.github.io/jq/): ```bash -google_takeout_parser --quiet parse -a json -f Activity --no-cache ./Takeout-New | \ +google_takeout_parser --quiet parse -a json -f Activity --no-cache ./Takeout-New | # select stuff like Youtube, m.youtube.com, youtube.com using jq - jq '.[] | select(.header | ascii_downcase | test("youtube")) | select(.titleUrl)' | \ - # grab the titleUrl - jq .titleUrl -r + jq '.[] | select(.header | ascii_downcase | test("youtube"))' | + # grab the titleUrl, ignoring nulls + jq 'select(.titleUrl) | .titleUrl' -r ``` Also contains a small utility command to help move/extract the google takeout: From b9c05a5b042cbbdb78ea0f2efd6987f15fe5c464 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Fri, 6 Oct 2023 18:24:08 -0700 Subject: [PATCH 25/27] remove useless function call --- google_takeout_parser/parse_json.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/google_takeout_parser/parse_json.py b/google_takeout_parser/parse_json.py index 2bc5819..0c39640 100644 --- a/google_takeout_parser/parse_json.py +++ b/google_takeout_parser/parse_json.py @@ -120,10 +120,6 @@ def _parse_timestamp_key(d: Dict[str, Any], key: str) -> datetime: return parse_json_utc_date(d[key]) -def _parse_location_timestamp(d: Dict[str, Any]) -> datetime: - return _parse_timestamp_key(d, "timestamp") - - def _parse_location_history(p: Path) -> Iterator[Res[Location]]: ### HMMM, seems that all the locations are right after one another. broken? May just be all the location history that google has on me ### see numpy.diff(list(map(lambda yy: y.at, filter(lambda y: isinstance(Location), events())))) @@ -136,7 +132,7 @@ def _parse_location_history(p: Path) -> Iterator[Res[Location]]: yield Location( lng=float(loc["longitudeE7"]) / 1e7, lat=float(loc["latitudeE7"]) / 1e7, - dt=_parse_location_timestamp(loc), + dt=_parse_timestamp_key(loc, "timestamp"), accuracy=None if accuracy is None else float(accuracy), ) except Exception as e: From ee20415363dbdeac9b40d1c3161b9953b6b4da57 Mon Sep 17 00:00:00 2001 From: karlicoss Date: Mon, 16 Oct 2023 22:19:01 +0100 Subject: [PATCH 26/27] semantic locaion history parser: handle missing placeId and placeConfidence (#50) --- google_takeout_parser/models.py | 2 +- google_takeout_parser/parse_json.py | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/google_takeout_parser/models.py b/google_takeout_parser/models.py index 6cf50d2..7fa9360 100644 --- a/google_takeout_parser/models.py +++ b/google_takeout_parser/models.py @@ -169,7 +169,7 @@ class PlaceVisit(BaseEvent): sourceInfoDeviceTag: Optional[int] otherCandidateLocationsJSON: str # TODO: parse these into an enum of some kind? may be prone to breaking due to new values from google though... 
- placeConfidence: str + placeConfidence: Optional[str] # older semantic history (pre-2018 didn't have it) placeVisitType: Optional[str] visitConfidence: float editConfirmationStatus: str diff --git a/google_takeout_parser/parse_json.py b/google_takeout_parser/parse_json.py index 0c39640..840ebf9 100644 --- a/google_takeout_parser/parse_json.py +++ b/google_takeout_parser/parse_json.py @@ -9,6 +9,7 @@ from .http_allowlist import convert_to_https_opt from .time_utils import parse_datetime_millis +from .log import logger from .models import ( Subtitles, LocationInfo, @@ -140,6 +141,11 @@ def _parse_location_history(p: Path) -> Iterator[Res[Location]]: _sem_required_keys = ["location", "duration"] +_sem_required_location_keys = [ + "placeId", # some fairly recent (as of 2023) places might miss it + "latitudeE7", + "longitudeE7", +] def _check_required_keys( @@ -168,7 +174,13 @@ def _parse_semantic_location_history(p: Path) -> Iterator[Res[PlaceVisit]]: yield RuntimeError(f"PlaceVisit: no '{missing_key}' key in '{p}'") continue try: - location = CandidateLocation.from_dict(placeVisit["location"]) + location_json = placeVisit["location"] + missing_location_key = _check_required_keys(location_json, _sem_required_location_keys) + if missing_location_key is not None: + # handle these fully defensively, since nothing at all we can do if it's missing these properties + logger.debug(f"CandidateLocation: {p}, no key '{missing_location_key}' in {location_json}") + continue + location = CandidateLocation.from_dict(location_json) duration = placeVisit["duration"] yield PlaceVisit( name=location.name, @@ -178,7 +190,7 @@ def _parse_semantic_location_history(p: Path) -> Iterator[Res[PlaceVisit]]: placeVisit.get("otherCandidateLocations", []), separators=(",", ":") ), sourceInfoDeviceTag=location.sourceInfoDeviceTag, - placeConfidence=placeVisit["placeConfidence"], + placeConfidence=placeVisit.get("placeConfidence"), placeVisitImportance=placeVisit.get("placeVisitImportance"), placeVisitType=placeVisit.get("placeVisitType"), visitConfidence=placeVisit["visitConfidence"], From be81ed489a22300965aa55e5451a50279933fa76 Mon Sep 17 00:00:00 2001 From: seanbreckenridge Date: Tue, 24 Oct 2023 09:39:22 -0700 Subject: [PATCH 27/27] v0.1.6: semantic_location: remove JSON field (#52) --- .github/workflows/ci.yaml | 9 +++--- google_takeout_parser/models.py | 16 +---------- google_takeout_parser/parse_json.py | 15 ++++++---- setup.cfg | 3 +- tests/test_json.py | 44 +++++++++++++---------------- tests/test_split_html.py | 2 +- tests/test_urls.py | 5 ++-- 7 files changed, 41 insertions(+), 53 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 0a395b6..2e85f77 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -11,10 +11,11 @@ jobs: strategy: matrix: platform: [ubuntu-latest, windows-latest] - python-version: [3.8, 3.9, "3.10", "3.11"] + python-version: [3.8, 3.9, "3.10", "3.11", "3.12"] exclude: [ {platform: windows-latest, python-version: "3.9"}, - {platform: windows-latest, python-version: "3.10"} + {platform: windows-latest, python-version: "3.10"}, + {platform: windows-latest, python-version: "3.11"} ] runs-on: ${{ matrix.platform }} @@ -31,10 +32,10 @@ jobs: pip install '.[testing]' - name: Run mypy run: | - mypy --install-types --non-interactive ./google_takeout_parser + mypy --install-types --non-interactive ./google_takeout_parser ./tests - name: Run pytest run: | pytest - name: Run flake8 run: | - flake8 ./google_takeout_parser + flake8 
./google_takeout_parser ./tests diff --git a/google_takeout_parser/models.py b/google_takeout_parser/models.py index 7fa9360..884bad0 100644 --- a/google_takeout_parser/models.py +++ b/google_takeout_parser/models.py @@ -22,7 +22,6 @@ from dataclasses import dataclass from .common import Res -from .log import logger Url = str @@ -167,7 +166,7 @@ class PlaceVisit(BaseEvent): startTime: datetime endTime: datetime sourceInfoDeviceTag: Optional[int] - otherCandidateLocationsJSON: str + otherCandidateLocations: List[CandidateLocation] # TODO: parse these into an enum of some kind? may be prone to breaking due to new values from google though... placeConfidence: Optional[str] # older semantic history (pre-2018 didn't have it) placeVisitType: Optional[str] @@ -183,19 +182,6 @@ def dt(self) -> datetime: # type: ignore[override] def key(self) -> Tuple[float, float, int, Optional[float]]: return self.lat, self.lng, int(self.startTime.timestamp()), self.visitConfidence - @property - def otherCandidateLocations(self) -> List[CandidateLocation]: - import json - - loaded = json.loads(self.otherCandidateLocationsJSON) - if not isinstance(loaded, list): - logger.warning( - f"loading candidate locations: expected list, got {type(loaded)}, {loaded}" - ) - return [] - - return [CandidateLocation.from_dict(x) for x in loaded] - @dataclass class ChromeHistory(BaseEvent): diff --git a/google_takeout_parser/parse_json.py b/google_takeout_parser/parse_json.py index 840ebf9..49b6032 100644 --- a/google_takeout_parser/parse_json.py +++ b/google_takeout_parser/parse_json.py @@ -175,10 +175,14 @@ def _parse_semantic_location_history(p: Path) -> Iterator[Res[PlaceVisit]]: continue try: location_json = placeVisit["location"] - missing_location_key = _check_required_keys(location_json, _sem_required_location_keys) + missing_location_key = _check_required_keys( + location_json, _sem_required_location_keys + ) if missing_location_key is not None: # handle these fully defensively, since nothing at all we can do if it's missing these properties - logger.debug(f"CandidateLocation: {p}, no key '{missing_location_key}' in {location_json}") + logger.debug( + f"CandidateLocation: {p}, no key '{missing_location_key}' in {location_json}" + ) continue location = CandidateLocation.from_dict(location_json) duration = placeVisit["duration"] @@ -186,9 +190,10 @@ def _parse_semantic_location_history(p: Path) -> Iterator[Res[PlaceVisit]]: name=location.name, address=location.address, # separators=(",", ":") removes whitespace from json.dumps - otherCandidateLocationsJSON=json.dumps( - placeVisit.get("otherCandidateLocations", []), separators=(",", ":") - ), + otherCandidateLocations=[ + CandidateLocation.from_dict(pv) + for pv in placeVisit.get("otherCandidateLocations", []) + ], sourceInfoDeviceTag=location.sourceInfoDeviceTag, placeConfidence=placeVisit.get("placeConfidence"), placeVisitImportance=placeVisit.get("placeVisitImportance"), diff --git a/setup.cfg b/setup.cfg index e15f364..8439a51 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = google_takeout_parser -version = 0.1.5 +version = 0.1.6 description = Parses data out of your Google Takeout (History, Activity, Youtube, Locations, etc...) 
long_description = file: README.md long_description_content_type = text/markdown @@ -18,6 +18,7 @@ classifiers = Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.11 + Programming Language :: Python :: 3.12 keywords = google data parsing [options] diff --git a/tests/test_json.py b/tests/test_json.py index 5d1bbd1..9616e2f 100644 --- a/tests/test_json.py +++ b/tests/test_json.py @@ -1,5 +1,4 @@ import json -import dataclasses import datetime from pathlib import Path from typing import Iterator, Any @@ -37,13 +36,13 @@ def test_parse_activity_json(tmp_path_f: Path) -> None: description=None, titleUrl=None, subtitles=[ - ("Computer programming", None), - ("Computer Science", None), - ("PostgreSQL", None), - ("Technology", None), + models.Subtitles("Computer programming", None), + models.Subtitles("Computer Science", None), + models.Subtitles("PostgreSQL", None), + models.Subtitles("Technology", None), ], locationInfos=[ - ( + models.LocationInfo( "At this general area", "https://www.google.com/maps/@?api=1&map_action=map¢er=lat,lon&zoom=12", "From your Location History", @@ -98,7 +97,7 @@ def test_parse_app_installs(tmp_path_f: Path) -> None: ] -def test_location_old(tmp_path_f) -> None: +def test_location_old(tmp_path_f: Path) -> None: contents = '{"locations": [{"timestampMs": "1512947698030", "latitudeE7": 351324213, "longitudeE7": -1122434441, "accuracy": 10}]}' fp = tmp_path_f / "file" fp.write_text(contents) @@ -191,12 +190,9 @@ def test_semantic_location_history(tmp_path_f: Path) -> None: fp = tmp_path_f / "file" fp.write_text(json.dumps(data)) res = list(prj._parse_semantic_location_history(fp)) - objbase = res[0] - assert not isinstance(objbase, Exception) + obj = res[0] + assert not isinstance(obj, Exception) # remove JSON, compare manually below - objd = dataclasses.asdict(objbase) - del objd["otherCandidateLocationsJSON"] - obj = models.PlaceVisit(**objd, otherCandidateLocationsJSON="{}") assert obj == models.PlaceVisit( lat=55.5555555, lng=-106.6666666, @@ -213,22 +209,20 @@ def test_semantic_location_history(tmp_path_f: Path) -> None: 2017, 12, 11, 1, 20, 6, 106000, tzinfo=datetime.timezone.utc ), sourceInfoDeviceTag=987654321, - otherCandidateLocationsJSON="{}", placeConfidence="MEDIUM_CONFIDENCE", placeVisitImportance="MAIN", placeVisitType="SINGLE_PLACE", visitConfidence=65.45, editConfirmationStatus="NOT_CONFIRMED", + otherCandidateLocations=[ + models.CandidateLocation( + lat=42.3984239, + lng=-156.5656565, + name="name2", + address="address2", + locationConfidence=24.475897, + placeId="XPRK4E4P", + sourceInfoDeviceTag=None, + ) + ], ) - - assert objbase.otherCandidateLocations == [ - models.CandidateLocation( - lat=42.3984239, - lng=-156.5656565, - name="name2", - address="address2", - locationConfidence=24.475897, - placeId="XPRK4E4P", - sourceInfoDeviceTag=None, - ) - ] diff --git a/tests/test_split_html.py b/tests/test_split_html.py index 8dedc08..f9c7ced 100644 --- a/tests/test_split_html.py +++ b/tests/test_split_html.py @@ -33,7 +33,7 @@ def in_golang_dir() -> Generator[None, None, None]: "TEST_GOLANG_SPLIT" not in os.environ, reason="TEST_GOLANG_SPLIT not set, skipping test", ) -def test_split_html(in_golang_dir) -> None: +def test_split_html(in_golang_dir: None) -> None: with tempfile.TemporaryDirectory() as temp_dir: subprocess.run( [ diff --git a/tests/test_urls.py b/tests/test_urls.py index 4a05125..983b67d 100644 --- a/tests/test_urls.py +++ b/tests/test_urls.py @@ -1,8 +1,9 @@ import logging +from 
pytest import LogCaptureFixture from google_takeout_parser.http_allowlist import _convert_to_https -def test__convert_to_https(caplog) -> None: +def test_convert_to_https(caplog: LogCaptureFixture) -> None: with caplog.at_level(logging.DEBUG): url = "http://www.google.com" assert _convert_to_https(url) == "https://www.google.com" @@ -22,7 +23,7 @@ def test__convert_to_https(caplog) -> None: url = "http://m.youtube.com/watch?v=123" assert _convert_to_https(url) == "https://m.youtube.com/watch?v=123" - from logzero import logger + from logzero import logger # type: ignore[import] logger.propagate = True