Merge branch 'master' into handlerMap_localization
purarue committed Oct 24, 2023
2 parents 5d2c905 + be81ed4 commit d466435
Showing 30 changed files with 1,232 additions and 437 deletions.
15 changes: 8 additions & 7 deletions .github/workflows/ci.yaml
@@ -2,19 +2,20 @@ name: CI

on:
push:
branches: [master]
branches: ['*']
pull_request:
branches: [master]
branches: ['*']

jobs:
build:
strategy:
matrix:
platform: [ubuntu-latest, windows-latest]
python-version: [3.7, 3.8, 3.9, "3.10"]
python-version: [3.8, 3.9, "3.10", "3.11", "3.12"]
exclude: [
{platform: windows-latest, python-version: "3.8"},
{platform: windows-latest, python-version: "3.9"}
{platform: windows-latest, python-version: "3.9"},
{platform: windows-latest, python-version: "3.10"},
{platform: windows-latest, python-version: "3.11"}
]

runs-on: ${{ matrix.platform }}
@@ -31,10 +32,10 @@ jobs:
pip install '.[testing]'
- name: Run mypy
run: |
mypy --install-types --non-interactive ./google_takeout_parser
mypy --install-types --non-interactive ./google_takeout_parser ./tests
- name: Run pytest
run: |
pytest
- name: Run flake8
run: |
flake8 ./google_takeout_parser
flake8 ./google_takeout_parser ./tests
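The `exclude` entries above trim the matrix so Windows only runs on the newest Python. As a sanity check of which jobs actually run, here is a minimal sketch (not part of the repo; `expand_matrix` is a hypothetical helper mirroring how GitHub Actions expands a matrix — full cross product of the axes, minus any combination listed in `exclude`):

```python
from itertools import product

def expand_matrix(axes, exclude):
    # cross product of all axis values, as one dict per job
    keys = list(axes)
    combos = [dict(zip(keys, values)) for values in product(*axes.values())]
    # drop any combination that exactly matches an exclude entry
    return [c for c in combos if c not in exclude]

jobs = expand_matrix(
    {
        "platform": ["ubuntu-latest", "windows-latest"],
        "python-version": ["3.8", "3.9", "3.10", "3.11", "3.12"],
    },
    exclude=[
        {"platform": "windows-latest", "python-version": v}
        for v in ["3.8", "3.9", "3.10", "3.11"]
    ],
)
# five ubuntu jobs plus a single windows job on 3.12
print(len(jobs))  # → 6
```

This keeps the slower Windows runner to one representative Python version while ubuntu covers the full range.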
39 changes: 32 additions & 7 deletions README.md
@@ -12,6 +12,7 @@ Parses data out of your [Google Takeout](https://takeout.google.com/) (History,
- [Usage](#usage)
- [CLI Usage](#cli-usage)
- [Library Usage](#library-usage)
- [Legacy HTML Parsing](#legacy-html-parsing)
- [Contributing](#contributing)
- [Testing](#testing)

@@ -31,6 +32,8 @@ Since the Takeout slowly removes old events over time, I would recommend periodi
- Select JSON as format
- In options, deselect `music-library-songs`, `music-uploads` and `videos`

**Be sure to select JSON whenever possible**. Code to parse the HTML format is included here, but it is treated as legacy code and comes with worse performance and a myriad of other issues. See [legacy html parsing](#legacy-html-parsing).

The process for getting these isn't that great -- you have to manually go to [takeout.google.com](https://takeout.google.com) every few months, select what you want to export info for, and then it puts the zipped file into your google drive. You can tell it to run it at specific intervals, but I personally haven't found that to be that reliable.

This currently parses:
@@ -51,18 +54,32 @@ This was extracted out of [my HPI](https://github.com/seanbreckenridge/HPI/tree/

## Installation

Requires `python3.7+`
Requires `python3.8+`

To install with pip, run:

pip install google_takeout_parser
pip install google-takeout-parser

## Usage

### CLI Usage

Can be accessed by either `google_takeout_parser` or `python -m google_takeout_parser`. Offers a basic interface to list/clear the cache directory, and/or parse/merge a takeout and interact with it in a REPL:

```
Usage: google_takeout_parser parse [OPTIONS] TAKEOUT_DIR
Parse a takeout directory
Options:
-f, --filter [Activity|LikedYoutubeVideo|PlayStoreAppInstall|Location|ChromeHistory|YoutubeComment|PlaceVisit]
Filter to only show events of this type
-a, --action [repl|summary|json]
What to do with the parsed result [default: repl]
--cache / --no-cache [default: no-cache]
-h, --help Show this message and exit.
```

To clear the `cachew` cache: `google_takeout_parser cache_dir clear`

A few examples of parsing takeouts:
@@ -89,11 +106,14 @@ Counter({'Activity': 366292,
'ChromeHistory': 4})
```

Can also dump the info to JSON; e.g. to filter YouTube links from your Activity:
Can also dump the info to JSON; e.g. to filter YouTube-related stuff from your Activity using [jq](https://jqlang.github.io/jq/):

```bash
google_takeout_parser parse -a json --no-cache ./Takeout-New \
| jq '.[] | select(.type == "Activity") | select(.header == "YouTube") | .titleUrl'
google_takeout_parser --quiet parse -a json -f Activity --no-cache ./Takeout-New |
# select stuff like Youtube, m.youtube.com, youtube.com using jq
jq '.[] | select(.header | ascii_downcase | test("youtube"))' |
# grab the titleUrl, ignoring nulls
jq 'select(.titleUrl) | .titleUrl' -r
```

Also contains a small utility command to help move/extract the google takeout:
@@ -150,7 +170,7 @@ If you don't want to cache the results but want to merge results from multiple t
from google_takeout_parser.merge import merge_events, TakeoutParser
itrs = [] # list of iterators of google events
for path in ['path/to/Takeout-1599315526', 'path/to/Takeout-1616796262']:
# ignore errors
# ignore errors, error_policy can be 'yield', 'raise' or 'drop'
tk = TakeoutParser(path, error_policy="drop")
itrs.append(tk.parse(cache=False))
res = list(merge_events(*itrs))
@@ -168,13 +188,18 @@ len(locations)

I personally use this exclusively through the [HPI google takeout](https://github.com/karlicoss/HPI/blob/master/my/google/takeout/parser.py) module, as a configuration layer that locates my takeouts on disk and 'automatically' unzips them (I store them as zips), so it doesn't require me to maintain an unpacked view.

### Legacy HTML Parsing

I would _heavily recommend against_ using the HTML format for `My Activity`. It is not always possible to properly parse the metadata, is more prone to errors parsing dates due to local timezones, and takes much longer to parse than the JSON format.

On certain machines, the giant HTML files may even take so much memory that the process is eventually killed for using too much memory. For a workaround, see [split_html](./split_html).

### Contributing

Just to give a brief overview, to add new functionality (parsing some new folder that this doesn't currently support), you'd need to:

- Add a `model` for it in [`models.py`](google_takeout_parser/models.py) subclassing `BaseEvent` and adding it to the Union at the bottom of the file. That should have a `key` property function which describes each event uniquely (used to merge takeout events)
- Write a function which takes the `Path` to the file you're trying to parse and converts it to the model you created (See examples in [`parse_json.py`](google_takeout_parser/parse_json.py)). Ideally, extract a single raw item from the takeout file and add a test for it so it's obvious when/if the format changes.
- Set [the `return_type`](https://github.com/seanbreckenridge/google_takeout_parser/blob/7b1ee8ec3c3f36e6f279f20a9a214b6a3e8775f5/google_takeout_parser/parse_json.py#L71) property on the function, to use for caching/filtering
- Add a regex match for the file path to the [`DEFAULT_HANDLER_MAP`](https://github.com/seanbreckenridge/google_takeout_parser/blob/2bd64b7373e4a2ac2ace32e03b25ca3b7e901034/google_takeout_parser/path_dispatch.py#L48)
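The steps above can be sketched roughly as follows. Everything here is hypothetical illustration, not code from the repo: `MyEvent` and `parse_my_folder` are made-up names, and the real models subclass `BaseEvent` in `models.py` and get added to the Union at the bottom of that file:

```python
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Iterator
import json

@dataclass
class MyEvent:  # hypothetical model; real ones subclass BaseEvent
    title: str
    dt: datetime

    @property
    def key(self) -> str:
        # uniquely describes this event; used when merging takeout events
        return f"{self.title}-{self.dt.timestamp()}"

def parse_my_folder(path: Path) -> Iterator[MyEvent]:
    # hypothetical parser; real ones live in e.g. parse_json.py
    for blob in json.loads(path.read_text()):
        yield MyEvent(title=blob["title"], dt=datetime.fromtimestamp(blob["time"]))

# the return_type attribute is what caching/filtering inspects
parse_my_folder.return_type = MyEvent  # type: ignore[attr-defined]
```

After that, the remaining step is wiring the folder name to the function via a regex entry in `DEFAULT_HANDLER_MAP`.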

### Testing
14 changes: 5 additions & 9 deletions google_takeout_parser/__init__.py
@@ -1,10 +1,6 @@
from pkg_resources import get_distribution, DistributionNotFound
import importlib.metadata

try:
# Change here if project is renamed and does not equal the package name
dist_name = __name__
__version__ = get_distribution(dist_name).version
except DistributionNotFound:
__version__ = "unknown"
finally:
del get_distribution, DistributionNotFound
# Change here if project is renamed and does not equal the package name
__version__ = importlib.metadata.version(__name__)

del importlib
71 changes: 58 additions & 13 deletions google_takeout_parser/__main__.py
@@ -2,12 +2,14 @@
import json
from datetime import datetime, date
import dataclasses
from typing import List, Optional, Callable, Sequence, Any
from typing import List, Optional, Callable, Sequence, Any, Dict, Type, Tuple

import click


@click.group()
@click.group(
context_settings={"help_option_names": ["-h", "--help"], "max_content_width": 120}
)
@click.option(
"--verbose/--quiet",
default=None,
@@ -30,12 +32,24 @@ def main(verbose: Optional[bool]) -> None:
log.logger = log.setup(level=logging.ERROR)


# use the union of types to determine the possible filters
from .models import DEFAULT_MODEL_TYPE, get_union_args

model_types: Optional[Tuple[Type[DEFAULT_MODEL_TYPE]]] = get_union_args(
DEFAULT_MODEL_TYPE
)
assert model_types is not None

FILTER_OPTIONS: Dict[str, Type[DEFAULT_MODEL_TYPE]] = {
t.__name__: t for t in model_types
}

SHARED = [
click.option("--cache/--no-cache", default=False, show_default=True),
click.option(
"-a",
"--action",
type=click.Choice(["repl", "summary", "json"]),
type=click.Choice(["repl", "summary", "json"], case_sensitive=False),
default="repl",
help="What to do with the parsed result",
show_default=True,
@@ -46,8 +60,16 @@ def main(verbose: Optional[bool]) -> None:
type=click.Choice(["en", "de"]),
default="en",
help="Locale used by DEFAULT_HANDLER_MAP to resolve folder names to parser models",
show_default=True
)
show_default=True,
),
click.option(
"-f",
"--filter",
"filter_",
type=click.Choice(list(FILTER_OPTIONS.keys()), case_sensitive=False),
multiple=False,
help="Filter to only show events of this type",
),
]


@@ -91,41 +113,64 @@ def _handle_action(res: List[Any], action: str) -> None:
@main.command(short_help="parse a takeout directory")
@shared_options
@click.argument("TAKEOUT_DIR", type=click.Path(exists=True), required=True)
def parse(cache: bool, action: str, locale: str, takeout_dir: str) -> None:
def parse(
cache: bool, locale: str, action: str, takeout_dir: str, filter_: str
) -> None:
"""
Parse a takeout directory
"""
from .common import Res
from .models import BaseEvent
from .path_dispatch import TakeoutParser, LocalizedHandler
from .path_dispatch import TakeoutParser
from .log import logger

tp = TakeoutParser(
takeout_dir,
error_policy="drop",
# None if no handler found, in this case TakeoutParser defaults
handlers=LocalizedHandler.handler_from_string(locale)
handlers=LocalizedHandler.handler_from_string(locale),
)
# note: actually no exceptions here, since they're dropped
res: List[Res[BaseEvent]] = list(tp.parse(cache=cache))
if cache:
if filter_:
logger.warn(
"As it would otherwise re-compute every time, filtering happens after loading from cache"
)
res = list(tp.parse(cache=True))
if filter_:
filter_type = FILTER_OPTIONS[filter_]
res = [r for r in res if isinstance(r, filter_type)]
else:
res = list(tp.parse(cache=False, filter_type=FILTER_OPTIONS.get(filter_, None)))
_handle_action(res, action)


@main.command(short_help="merge multiple takeout directories")
@shared_options
@click.argument("TAKEOUT_DIR", type=click.Path(exists=True), nargs=-1, required=True)
def merge(cache: bool, action: str, takeout_dir: Sequence[str]) -> None:
def merge(cache: bool, action: str, takeout_dir: Sequence[str], filter_: str) -> None:
"""
Parse and merge multiple takeout directories
"""
from .path_dispatch import TakeoutParser
from .merge import cached_merge_takeouts, merge_events
from .models import DEFAULT_MODEL_TYPE
from .models import DEFAULT_MODEL_TYPE, Res
from .log import logger

res: List[DEFAULT_MODEL_TYPE] = []
res: List[Res[DEFAULT_MODEL_TYPE]] = []
filter_type: Optional[Type[DEFAULT_MODEL_TYPE]]
if cache:
if filter_:
logger.warn(
"As it would otherwise re-compute every time, filtering happens after loading from cache"
)
res = list(cached_merge_takeouts(list(takeout_dir)))
if filter_:
filter_type = FILTER_OPTIONS[filter_]
res = [r for r in res if isinstance(r, filter_type)]
else:
res = list(merge_events(*iter([TakeoutParser(p).parse(cache=False) for p in takeout_dir]))) # type: ignore[arg-type]
filter_type = FILTER_OPTIONS[filter_] if filter_ else None
res = list(merge_events(*iter([TakeoutParser(p).parse(cache=False, filter_type=filter_type) for p in takeout_dir]))) # type: ignore[arg-type]
_handle_action(res, action)


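The `FILTER_OPTIONS` mapping introduced in this diff is built by reflecting over the models Union. The general pattern can be shown with the stdlib's `typing.get_args` (the repo uses its own `get_union_args` helper; `Activity`/`ChromeHistory` below are stand-in dataclasses, not the real models):

```python
from dataclasses import dataclass
from typing import Dict, Type, Union, get_args

@dataclass
class Activity:  # stand-in for the real model
    header: str

@dataclass
class ChromeHistory:  # stand-in for the real model
    url: str

Event = Union[Activity, ChromeHistory]

# map class name -> class, so a CLI choice like `-f Activity`
# can be resolved back to a type for isinstance filtering
FILTER_OPTIONS: Dict[str, Type] = {t.__name__: t for t in get_args(Event)}

events = [Activity(header="YouTube"), ChromeHistory(url="https://example.com")]
chosen = FILTER_OPTIONS["Activity"]
print([e for e in events if isinstance(e, chosen)])
```

This is also why filtering after a cache load is a plain `isinstance` scan, while the uncached path can push `filter_type` down into `parse()` and skip whole files.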
8 changes: 0 additions & 8 deletions google_takeout_parser/compat.py

This file was deleted.
