From b1a06283824fdeb81fa8ae434275db9d224624dd Mon Sep 17 00:00:00 2001 From: Dima Gerasimov Date: Wed, 20 Apr 2022 21:58:10 +0100 Subject: [PATCH] my.youtube: use new my.google.takeout.parser module for its data - fallback on the old logic if google_takeout_parser isn't available - move to my.youtube.takeout (possibly mixing in other sources later) - keep my.media.youtube, but issue deprecation warning currently used in orger etc, so doesn't hurt to keep - also fixes https://github.com/karlicoss/HPI/issues/113 --- my/core/compat.py | 7 +++ my/media/__init__.py | 0 my/media/youtube.py | 46 ++-------------- my/youtube/takeout.py | 120 ++++++++++++++++++++++++++++++++++++++++++ tests/youtube.py | 30 ++++++++--- 5 files changed, 153 insertions(+), 50 deletions(-) delete mode 100644 my/media/__init__.py mode change 100755 => 100644 my/media/youtube.py create mode 100755 my/youtube/takeout.py diff --git a/my/core/compat.py b/my/core/compat.py index a4175b68..4dc8865b 100644 --- a/my/core/compat.py +++ b/my/core/compat.py @@ -83,3 +83,10 @@ def sqlite_backup(*, source: sqlite3.Connection, dest: sqlite3.Connection, **kwa dest.cursor().executescript(tempfile.read()) dest.commit() + + +# can remove after python3.9 +def removeprefix(text: str, prefix: str) -> str: + if text.startswith(prefix): + return text[len(prefix):] + return text diff --git a/my/media/__init__.py b/my/media/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/my/media/youtube.py b/my/media/youtube.py old mode 100755 new mode 100644 index 8212f12f..efaa74b0 --- a/my/media/youtube.py +++ b/my/media/youtube.py @@ -1,43 +1,5 @@ -#!/usr/bin/env python3 -from datetime import datetime -from typing import NamedTuple, List, Iterable - -from ..google.takeout.html import read_html -from ..google.takeout.paths import get_last_takeout - - -class Watched(NamedTuple): - url: str - title: str - when: datetime - - @property - def eid(self) -> str: - return f'{self.url}-{self.when.isoformat()}' - - 
-def watched() -> Iterable[Watched]: - # TODO need to use a glob? to make up for old takouts that didn't start with Takeout/ - path = 'Takeout/My Activity/YouTube/MyActivity.html' # looks like this one doesn't have retention? so enough to use the last - # TODO YouTube/history/watch-history.html, also YouTube/history/watch-history.json - last = get_last_takeout(path=path) - if last is None: - return [] - - - watches: List[Watched] = [] - for dt, url, title in read_html(last, path): - watches.append(Watched(url=url, title=title, when=dt)) - - # TODO hmm they already come sorted.. wonder if should just rely on it.. - return list(sorted(watches, key=lambda e: e.when)) - - -from ..core import stat, Stats -def stats() -> Stats: - return stat(watched) - - -# todo deprecate -get_watched = watched +from ..core.warnings import high +high("DEPRECATED! Please use my.youtube.takeout instead.") +from ..core.util import __NOT_HPI_MODULE__ +from ..youtube.takeout import * diff --git a/my/youtube/takeout.py b/my/youtube/takeout.py new file mode 100755 index 00000000..3d284b68 --- /dev/null +++ b/my/youtube/takeout.py @@ -0,0 +1,120 @@ +from typing import NamedTuple, List, Iterable + +from ..core import datetime_aware, Res, LazyLogger +from ..core.compat import removeprefix + + +logger = LazyLogger(__name__) + + +class Watched(NamedTuple): + url: str + title: str + when: datetime_aware + + @property + def eid(self) -> str: + return f'{self.url}-{self.when.isoformat()}' + + +# todo define error policy? +# although it has one from google takeout module.. so not sure + +def watched() -> Iterable[Res[Watched]]: + try: + from ..google.takeout.parser import events + from google_takeout_parser.models import Activity + except ModuleNotFoundError as ex: + logger.exception(ex) + from ..core.warnings import high + high("Please set up my.google.takeout.parser module for better youtube support. 
Falling back to legacy implementation.") + yield from _watched_legacy() + return + + YOUTUBE_VIDEO_LINK = '://www.youtube.com/watch?v=' + + # TODO would be nice to filter, e.g. it's kinda pointless to process Location events + for e in events(): + if isinstance(e, Exception): + yield e + + if not isinstance(e, Activity): + continue + + url = e.titleUrl + header = e.header + title = e.title + + if url is None: + continue + + if header in {'Image Search', 'Search', 'Chrome'}: + # sometimes results in youtube links.. but definitely not watch history + continue + + if header not in {'YouTube', 'youtube.com'}: + # TODO hmm -- wonder if these would end up in dupes in takeout? would be nice to check + # perhaps this would be easier once we have universal ids + if YOUTUBE_VIDEO_LINK in url: + # TODO maybe log in this case or something? + pass + continue + + if header == 'youtube.com' and title.startswith('Visited '): + continue + + if title.startswith('Searched for') and url.startswith('https://www.youtube.com/results'): + # search activity, don't need it here + continue + + if title.startswith('Subscribed to') and url.startswith('https://www.youtube.com/channel/'): + # todo might be interesting to process somewhere? 
+ continue + + # all titles contain it, so pointless to include 'Watched ' + # also compatible with legacy titles + title = removeprefix(title, 'Watched ') + + if YOUTUBE_VIDEO_LINK not in url: + if e.details == ['From Google Ads']: + # weird, sometimes results in odd + continue + if title == 'Used YouTube' and e.products == ['Android']: + continue + + yield RuntimeError(f'Unexpected url: {e}') + continue + + yield Watched( + url=url, + title=title, + when=e.time, + ) + + +from ..core import stat, Stats +def stats() -> Stats: + return stat(watched) + + +### deprecated stuff (keep in my.media.youtube) + +get_watched = watched + + +def _watched_legacy() -> Iterable[Watched]: + from ..google.takeout.html import read_html + from ..google.takeout.paths import get_last_takeout + + # todo looks like this one doesn't have retention? so enough to use the last + path = 'Takeout/My Activity/YouTube/MyActivity.html' + last = get_last_takeout(path=path) + if last is None: + return [] + + watches: List[Watched] = [] + for dt, url, title in read_html(last, path): + watches.append(Watched(url=url, title=title, when=dt)) + + # todo hmm they already come sorted.. wonder if should just rely on it.. + return list(sorted(watches, key=lambda e: e.when)) diff --git a/tests/youtube.py b/tests/youtube.py index d514061a..4864ee92 100644 --- a/tests/youtube.py +++ b/tests/youtube.py @@ -1,22 +1,36 @@ # TODO move elsewhere? # these tests would only make sense with some existing data? although some of them would work for everyone.. # not sure what's a good way of handling this.. +from datetime import datetime +import pytz +from more_itertools import bucket + + from .common import skip_if_not_karlicoss as pytestmark # TODO ugh. 
if i uncomment this here (on top level), then this test vvv fails # from my.media.youtube import get_watched, Watched # HPI_TESTS_KARLICOSS=true pytest -raps tests/tz.py tests/youtube.py + def test() -> None: - from my.media.youtube import get_watched, Watched - watched = list(get_watched()) - assert len(watched) > 1000 + from my.youtube.takeout import watched, Watched + videos = [w for w in watched() if not isinstance(w, Exception)] + assert len(videos) > 1000 - from datetime import datetime - import pytz - w = Watched( + # results in nicer errors, otherwise annoying to check against thousands of videos + grouped = bucket(videos, key=lambda w: (w.url, w.title)) + + w1 = Watched( url='https://www.youtube.com/watch?v=hTGJfRPLe08', title='Jamie xx - Gosh', - when=datetime(year=2018, month=6, day=21, hour=5, minute=48, second=34, tzinfo=pytz.utc), + when=pytz.timezone('Europe/London').localize(datetime(year=2018, month=6, day=21, hour=6, minute=48, second=34)), + ) + assert w1 in list(grouped[(w1.url, w1.title)]) + + w2 = Watched( + url='https://www.youtube.com/watch?v=IZ_8b_Ydsv0', + title='Why LESS Sensitive Tests Might Be Better', + when=pytz.utc.localize(datetime(year=2021, month=1, day=15, hour=17, minute=54, second=12)), ) - assert w in watched + assert w2 in list(grouped[(w2.url, w2.title)])