-
Notifications
You must be signed in to change notification settings - Fork 60
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
my.youtube: use new my.google.takeout.parser module for its data
- fallback on the old logic if google_takeout_parser isn't available
- move to my.youtube.takeout (possibly mixing in other sources later)
- keep my.media.youtube, but issue deprecation warning
  (currently used in orger etc, so doesn't hurt to keep)
- also fixes #113
- Loading branch information
Showing
5 changed files
with
153 additions
and
50 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,43 +1,5 @@ | ||
#!/usr/bin/env python3 | ||
from datetime import datetime | ||
from typing import NamedTuple, List, Iterable | ||
|
||
from ..google.takeout.html import read_html | ||
from ..google.takeout.paths import get_last_takeout | ||
|
||
|
||
class Watched(NamedTuple): | ||
url: str | ||
title: str | ||
when: datetime | ||
|
||
@property | ||
def eid(self) -> str: | ||
return f'{self.url}-{self.when.isoformat()}' | ||
|
||
|
||
def watched() -> Iterable[Watched]: | ||
# TODO need to use a glob? to make up for old takeouts that didn't start with Takeout/ | ||
path = 'Takeout/My Activity/YouTube/MyActivity.html' # looks like this one doesn't have retention? so enough to use the last | ||
# TODO YouTube/history/watch-history.html, also YouTube/history/watch-history.json | ||
last = get_last_takeout(path=path) | ||
if last is None: | ||
return [] | ||
|
||
|
||
watches: List[Watched] = [] | ||
for dt, url, title in read_html(last, path): | ||
watches.append(Watched(url=url, title=title, when=dt)) | ||
|
||
# TODO hmm they already come sorted.. wonder if should just rely on it.. | ||
return list(sorted(watches, key=lambda e: e.when)) | ||
|
||
|
||
from ..core import stat, Stats | ||
def stats() -> Stats: | ||
return stat(watched) | ||
|
||
|
||
# todo deprecate | ||
get_watched = watched | ||
from ..core.warnings import high | ||
high("DEPRECATED! Please use my.youtube.takeout instead.") | ||
from ..core.util import __NOT_HPI_MODULE__ | ||
|
||
from ..youtube.takeout import * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
from typing import NamedTuple, List, Iterable | ||
|
||
from ..core import datetime_aware, Res, LazyLogger | ||
from ..core.compat import removeprefix | ||
|
||
|
||
logger = LazyLogger(__name__) | ||
|
||
|
||
class Watched(NamedTuple):
    """A single watched video: its url, its title and when it was watched."""

    url: str
    title: str
    when: datetime_aware

    @property
    def eid(self) -> str:
        """Entry id: the url and the ISO-formatted ``when``, joined by a dash."""
        return f'{self.url}-{self.when.isoformat()}'
|
||
|
||
# todo define error policy? | ||
# although it has one from google takeout module.. so not sure | ||
|
||
def watched() -> Iterable[Res[Watched]]:
    """
    Yield YouTube watch history entries built from Google Takeout activity events.

    Events are taken from ``..google.takeout.parser``. If importing it (or
    ``google_takeout_parser.models``) raises ``ModuleNotFoundError``, the error
    is logged, a warning is issued via ``core.warnings.high`` and the results
    of ``_watched_legacy()`` are yielded instead.

    Errors are yielded, not raised: exceptions coming out of the event stream
    are passed through unchanged, and a ``RuntimeError`` is yielded for a
    YouTube activity whose url is not a video link and isn't one of the known
    ignorable cases.
    """
    try:
        from ..google.takeout.parser import events
        from google_takeout_parser.models import Activity
    except ModuleNotFoundError as ex:
        logger.exception(ex)
        from ..core.warnings import high
        high("Please set up my.google.takeout.parser module for better youtube support. Falling back to legacy implementation.")
        yield from _watched_legacy()
        return

    YOUTUBE_VIDEO_LINK = '://www.youtube.com/watch?v='

    # TODO would be nice to filter, e.g. it's kinda pointless to process Location events
    for e in events():
        if isinstance(e, Exception):
            # pass the error on to the caller and move on to the next event
            yield e
            continue

        if not isinstance(e, Activity):
            continue

        url = e.titleUrl
        header = e.header
        title = e.title

        if url is None:
            continue

        if header in {'Image Search', 'Search', 'Chrome'}:
            # sometimes results in youtube links.. but definitely not watch history
            continue

        if header not in {'YouTube', 'youtube.com'}:
            # TODO hmm -- wonder if these would end up in dupes in takeout? would be nice to check
            # perhaps this would be easier once we have universal ids
            if YOUTUBE_VIDEO_LINK in url:
                # TODO maybe log in this case or something?
                pass
            continue

        if header == 'youtube.com' and title.startswith('Visited '):
            continue

        if title.startswith('Searched for') and url.startswith('https://www.youtube.com/results'):
            # search activity, don't need it here
            continue

        if title.startswith('Subscribed to') and url.startswith('https://www.youtube.com/channel/'):
            # todo might be interesting to process somewhere?
            continue

        # all titles contain it, so pointless to include 'Watched '
        # also compatible with legacy titles
        title = removeprefix(title, 'Watched ')

        if YOUTUBE_VIDEO_LINK not in url:
            if e.details == ['From Google Ads']:
                # weird, ads sometimes result in odd (non-video) urls
                continue
            if title == 'Used YouTube' and e.products == ['Android']:
                continue

            # anything else that isn't a video link is unexpected, so report it
            yield RuntimeError(f'Unexpected url: {e}')
            continue

        yield Watched(
            url=url,
            title=title,
            when=e.time,
        )
|
||
|
||
from ..core import stat, Stats | ||
def stats() -> Stats:
    """Return this module's stats, as computed by ``stat`` over ``watched``."""
    return stat(watched)
|
||
|
||
### deprecated stuff (keep in my.media.youtube) | ||
|
||
get_watched = watched | ||
|
||
|
||
def _watched_legacy() -> Iterable[Watched]:
    """
    Legacy implementation: read watch history from the YouTube 'My Activity' HTML.

    Uses the takeout returned by ``get_last_takeout`` for the activity path.
    Returns the entries sorted by ``when``, or an empty list if
    ``get_last_takeout`` returns ``None``.
    """
    from ..google.takeout.html import read_html
    from ..google.takeout.paths import get_last_takeout

    # todo looks like this one doesn't have retention? so enough to use the last
    path = 'Takeout/My Activity/YouTube/MyActivity.html'
    last = get_last_takeout(path=path)
    if last is None:
        return []

    watches = (
        Watched(url=url, title=title, when=dt)
        for dt, url, title in read_html(last, path)
    )

    # todo hmm they already come sorted.. wonder if should just rely on it..
    return sorted(watches, key=lambda w: w.when)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,22 +1,36 @@ | ||
# TODO move elsewhere? | ||
# these tests would only make sense with some existing data? although some of them would work for everyone.. | ||
# not sure what's a good way of handling this.. | ||
from datetime import datetime | ||
import pytz | ||
from more_itertools import bucket | ||
|
||
|
||
from .common import skip_if_not_karlicoss as pytestmark | ||
|
||
# TODO ugh. if i uncomment this here (on top level), then this test vvv fails | ||
# from my.media.youtube import get_watched, Watched | ||
# HPI_TESTS_KARLICOSS=true pytest -raps tests/tz.py tests/youtube.py | ||
|
||
|
||
def test() -> None: | ||
from my.media.youtube import get_watched, Watched | ||
watched = list(get_watched()) | ||
assert len(watched) > 1000 | ||
from my.youtube.takeout import watched, Watched | ||
videos = [w for w in watched() if not isinstance(w, Exception)] | ||
assert len(videos) > 1000 | ||
|
||
from datetime import datetime | ||
import pytz | ||
w = Watched( | ||
# results in nicer errors, otherwise annoying to check against thousands of videos | ||
grouped = bucket(videos, key=lambda w: (w.url, w.title)) | ||
|
||
w1 = Watched( | ||
url='https://www.youtube.com/watch?v=hTGJfRPLe08', | ||
title='Jamie xx - Gosh', | ||
when=datetime(year=2018, month=6, day=21, hour=5, minute=48, second=34, tzinfo=pytz.utc), | ||
when=pytz.timezone('Europe/London').localize(datetime(year=2018, month=6, day=21, hour=6, minute=48, second=34)), | ||
) | ||
assert w1 in list(grouped[(w1.url, w1.title)]) | ||
|
||
w2 = Watched( | ||
url='https://www.youtube.com/watch?v=IZ_8b_Ydsv0', | ||
title='Why LESS Sensitive Tests Might Be Better', | ||
when=pytz.utc.localize(datetime(year=2021, month=1, day=15, hour=17, minute=54, second=12)), | ||
) | ||
assert w in watched | ||
assert w2 in list(grouped[(w2.url, w2.title)]) |