From b1a06283824fdeb81fa8ae434275db9d224624dd Mon Sep 17 00:00:00 2001
From: Dima Gerasimov <karlicoss@gmail.com>
Date: Wed, 20 Apr 2022 21:58:10 +0100
Subject: [PATCH] my.youtube: use new my.google.takeout.parser module for its
 data

- fall back to the old logic if google_takeout_parser isn't available
- move to my.youtube.takeout (possibly mixing in other sources later)
- keep my.media.youtube, but issue deprecation warning
  currently used in orger etc, so doesn't hurt to keep
- also fixes https://github.com/karlicoss/HPI/issues/113
---
 my/core/compat.py     |   7 +++
 my/media/__init__.py  |   0
 my/media/youtube.py   |  46 ++--------------
 my/youtube/takeout.py | 120 ++++++++++++++++++++++++++++++++++++++++++
 tests/youtube.py      |  30 ++++++++---
 5 files changed, 153 insertions(+), 50 deletions(-)
 delete mode 100644 my/media/__init__.py
 mode change 100755 => 100644 my/media/youtube.py
 create mode 100755 my/youtube/takeout.py

diff --git a/my/core/compat.py b/my/core/compat.py
index a4175b68..4dc8865b 100644
--- a/my/core/compat.py
+++ b/my/core/compat.py
@@ -83,3 +83,10 @@ def sqlite_backup(*, source: sqlite3.Connection, dest: sqlite3.Connection, **kwa
 
         dest.cursor().executescript(tempfile.read())
         dest.commit()
+
+
+# backport of str.removeprefix; can remove once python3.9 is the minimum supported version
+def removeprefix(text: str, prefix: str) -> str:
+    if text.startswith(prefix):
+        return text[len(prefix):]
+    return text
diff --git a/my/media/__init__.py b/my/media/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/my/media/youtube.py b/my/media/youtube.py
old mode 100755
new mode 100644
index 8212f12f..efaa74b0
--- a/my/media/youtube.py
+++ b/my/media/youtube.py
@@ -1,43 +1,5 @@
-#!/usr/bin/env python3
-from datetime import datetime
-from typing import NamedTuple, List, Iterable
-
-from ..google.takeout.html import read_html
-from ..google.takeout.paths import get_last_takeout
-
-
-class Watched(NamedTuple):
-    url: str
-    title: str
-    when: datetime
-
-    @property
-    def eid(self) -> str:
-        return f'{self.url}-{self.when.isoformat()}'
-
-
-def watched() -> Iterable[Watched]:
-    # TODO need to use a glob? to make up for old takouts that didn't start with Takeout/
-    path = 'Takeout/My Activity/YouTube/MyActivity.html' # looks like this one doesn't have retention? so enough to use the last
-    # TODO YouTube/history/watch-history.html, also YouTube/history/watch-history.json
-    last = get_last_takeout(path=path)
-    if last is None:
-        return []
-
-
-    watches: List[Watched] = []
-    for dt, url, title in read_html(last, path):
-        watches.append(Watched(url=url, title=title, when=dt))
-
-    # TODO hmm they already come sorted.. wonder if should just rely on it..
-    return list(sorted(watches, key=lambda e: e.when))
-
-
-from ..core import stat, Stats
-def stats() -> Stats:
-    return stat(watched)
-
-
-# todo deprecate
-get_watched = watched
+from ..core.warnings import high
+high("DEPRECATED! Please use my.youtube.takeout instead.")
+from ..core.util import __NOT_HPI_MODULE__
 
+from ..youtube.takeout import *
diff --git a/my/youtube/takeout.py b/my/youtube/takeout.py
new file mode 100755
index 00000000..3d284b68
--- /dev/null
+++ b/my/youtube/takeout.py
@@ -0,0 +1,120 @@
+from typing import NamedTuple, List, Iterable
+
+from ..core import datetime_aware, Res, LazyLogger
+from ..core.compat import removeprefix
+
+
+logger = LazyLogger(__name__)
+
+
+class Watched(NamedTuple):
+    url: str
+    title: str
+    when: datetime_aware
+
+    @property
+    def eid(self) -> str:
+        return f'{self.url}-{self.when.isoformat()}'
+
+
+# todo define error policy?
+# although it has one from google takeout module.. so not sure
+
+def watched() -> Iterable[Res[Watched]]:
+    """
+    Yield watched youtube videos from takeout activity; errors are yielded, not raised.
+
+    Falls back to the legacy html parser if my.google.takeout.parser can't be imported.
+    """
+    try:
+        from ..google.takeout.parser import events
+        from google_takeout_parser.models import Activity
+    except ModuleNotFoundError as ex:
+        logger.exception(ex)
+        from ..core.warnings import high
+        high("Please set up my.google.takeout.parser module for better youtube support. Falling back to legacy implementation.")
+        yield from _watched_legacy()
+        return
+
+    YOUTUBE_VIDEO_LINK = '://www.youtube.com/watch?v='
+
+    # TODO would be nice to filter, e.g. it's kinda pointless to process Location events
+    for e in events():
+        if isinstance(e, Exception):
+            # pass the error through and move on explicitly, rather than relying on the checks below
+            yield e
+            continue
+
+        if not isinstance(e, Activity):
+            continue
+
+        url = e.titleUrl
+        header = e.header
+        title = e.title
+
+        if url is None:
+            continue
+
+        if header in {'Image Search', 'Search', 'Chrome'}:
+            # sometimes results in youtube links.. but definitely not watch history
+            continue
+
+        if header not in {'YouTube', 'youtube.com'}:
+            # TODO hmm -- wonder if these would end up in dupes in takeout? would be nice to check
+            # perhaps this would be easier once we have universal ids
+            if YOUTUBE_VIDEO_LINK in url:
+                # TODO maybe log in this case or something?
+                pass
+            continue
+
+        if header == 'youtube.com' and title.startswith('Visited '):
+            continue
+        if title.startswith('Searched for') and url.startswith('https://www.youtube.com/results'):
+            # search activity, don't need it here
+            continue
+        if title.startswith('Subscribed to') and url.startswith('https://www.youtube.com/channel/'):
+            # todo might be interesting to process somewhere?
+            continue
+
+        # all titles contain it, so pointless to include 'Watched '
+        # also compatible with legacy titles
+        title = removeprefix(title, 'Watched ')
+
+        if YOUTUBE_VIDEO_LINK not in url:
+            if e.details == ['From Google Ads']:
+                # weird, sometimes results in odd
+                continue
+            if title == 'Used YouTube' and e.products == ['Android']:
+                continue
+            yield RuntimeError(f'Unexpected url: {e}')
+            continue
+
+        yield Watched(url=url, title=title, when=e.time)
+
+
+from ..core import stat, Stats
+def stats() -> Stats:
+    return stat(watched)
+
+
+### deprecated stuff (keep in my.media.youtube)
+
+get_watched = watched
+
+
+def _watched_legacy() -> Iterable[Watched]:
+    """Legacy implementation: parse MyActivity.html from the takeout returned by get_last_takeout."""
+    from ..google.takeout.html import read_html
+    from ..google.takeout.paths import get_last_takeout
+
+    # todo looks like this one doesn't have retention? so enough to use the last
+    activity_path = 'Takeout/My Activity/YouTube/MyActivity.html'
+    takeout = get_last_takeout(path=activity_path)
+    if takeout is None:
+        return []
+
+    entries: List[Watched] = [
+        Watched(url=link, title=name, when=ts) for ts, link, name in read_html(takeout, activity_path)
+    ]
+    # todo hmm they already come sorted.. wonder if should just rely on it..
+    return sorted(entries, key=lambda w: w.when)
diff --git a/tests/youtube.py b/tests/youtube.py
index d514061a..4864ee92 100644
--- a/tests/youtube.py
+++ b/tests/youtube.py
@@ -1,22 +1,36 @@
 # TODO move elsewhere?
 # these tests would only make sense with some existing data? although some of them would work for everyone..
 # not sure what's a good way of handling this..
+from datetime import datetime
+import pytz
+from more_itertools import bucket
+
+
 from .common import skip_if_not_karlicoss as pytestmark
 
 # TODO ugh. if i uncomment this here (on top level), then this test vvv fails
 # from my.media.youtube import get_watched, Watched
 # HPI_TESTS_KARLICOSS=true pytest -raps tests/tz.py tests/youtube.py
 
+
 def test() -> None:
-    from my.media.youtube import get_watched, Watched
-    watched = list(get_watched())
-    assert len(watched) > 1000
+    from my.youtube.takeout import watched, Watched
+    videos = [w for w in watched() if not isinstance(w, Exception)]
+    assert len(videos) > 1000
 
-    from datetime import datetime
-    import pytz
-    w = Watched(
+    # results in nicer errors, otherwise annoying to check against thousands of videos
+    grouped = bucket(videos, key=lambda w: (w.url, w.title))
+
+    w1 = Watched(
         url='https://www.youtube.com/watch?v=hTGJfRPLe08',
         title='Jamie xx - Gosh',
-        when=datetime(year=2018, month=6, day=21, hour=5, minute=48, second=34, tzinfo=pytz.utc),
+        when=pytz.timezone('Europe/London').localize(datetime(year=2018, month=6, day=21, hour=6, minute=48, second=34)),
+    )
+    assert w1 in list(grouped[(w1.url, w1.title)])
+
+    w2 = Watched(
+        url='https://www.youtube.com/watch?v=IZ_8b_Ydsv0',
+        title='Why LESS Sensitive Tests Might Be Better',
+        when=pytz.utc.localize(datetime(year=2021, month=1, day=15, hour=17, minute=54, second=12)),
     )
-    assert w in watched
+    assert w2 in list(grouped[(w2.url, w2.title)])