add csv youtube/live chat parsing
purarue committed Feb 11, 2024
1 parent 78a2c5d commit 372f002
Showing 7 changed files with 233 additions and 1 deletion.
8 changes: 7 additions & 1 deletion README.md
@@ -46,7 +46,11 @@ This currently parses:
- Location History `Location History/Location History.json`, `Location History/Records.json`
- Youtube:
- History - `YouTube and YouTube Music/history/*.html|*.json`
- Comments - `YouTube and YouTube Music/my-comments/*.html`
- Comments
- Legacy HTML Comment format: `YouTube and YouTube Music/my-comments/*.html`
- New CSV/JSON format (mostly CSV, but the comment contents themselves are a JSON blob; see the sketch after this list):
- `Youtube/comments/comments.csv`
- `Youtube/live chats/live chats.csv`
- Live Chat Messages - `YouTube and YouTube Music/my-live-chat-messages/*.html`
- Likes: `YouTube and YouTube Music/playlists/likes.json`
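
The JSON blob uses the `takeoutSegments` shape visible in the tests added in this commit; a minimal sketch of pulling the plain comment text back out (the helper name is invented for illustration, not part of the library):

```python
import json

def comment_text(content_json: str) -> str:
    # join the text of each takeoutSegment; shape taken from the test data in this commit
    segments = json.loads(content_json).get("takeoutSegments", [])
    return "".join(seg.get("text", "") for seg in segments)

print(comment_text('{"takeoutSegments":[{"text":"coalowl the legend"}]}'))
# -> coalowl the legend
```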

@@ -218,6 +222,8 @@ Don't feel required to add support for all locales, it's somewhat annoying to swap

Though, if your takeout is in some language this doesn't support, you can [create an issue](https://github.com/seanbreckenridge/google_takeout_parser/issues/new?title=support+new+locale) with the file structure (run `find Takeout` and/or `tree Takeout`), or contribute a locale file by creating a `path -> function` mapping and adding it to the global `LOCALES` variables in `locales/all.py` and `locales/main.py` (a rough sketch follows below)
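
For illustration only, a locale file's mapping might look like the following; the translated paths here are invented and would need to be verified against a real export, and the real shape to copy is in `locales/en.py`:

```python
# hypothetical locale mapping, modeled on locales/en.py -- the German-looking
# paths are invented for illustration, not taken from a real takeout
from .common import _parse_json_activity, _parse_location_history

DE = {
    r"Meine Aktivitäten/.*?\.json": _parse_json_activity,
    r"Standortverlauf/Records.json": _parse_location_history,
    r"YouTube( und YouTube Music)?/playlists/": None,  # None = recognized, but skipped
}
```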

This is pretty difficult to maintain, as it requires a lot of manual testing from people who have access to these takeouts and who actively use the language the takeout is in. My google account's main language is English, so I upkeep that locale whenever I notice changes, but it's not trivial to port those changes to other locales without swapping my language, making an export, waiting, and then switching back. I keep track of mismatched changes [in this board](https://github.com/users/seanbreckenridge/projects/1/views/1)

Ideally, you would select everything when doing a takeout (not just the `My Activity`/`Chrome`/`Location History` like I suggested above), so [paths that are not parsed can be ignored properly](https://github.com/seanbreckenridge/google_takeout_parser/blob/4981c241c04b5b37265710dcc6ca00f19d1eafb4/google_takeout_parser/locales/en.py#L105C1-L113).

### Testing
4 changes: 4 additions & 0 deletions google_takeout_parser/locales/common.py
@@ -12,6 +12,10 @@
_parse_semantic_location_history,
_parse_chrome_history,
)
from ..parse_csv import ( # noqa: F401
_parse_youtube_comments_csv,
_parse_youtube_live_chats_csv,
)

BaseResults = Iterator[Res[BaseEvent]]

6 changes: 6 additions & 0 deletions google_takeout_parser/locales/en.py
@@ -4,6 +4,8 @@
_parse_html_comment_file,
_parse_json_activity,
_parse_likes,
_parse_youtube_comments_csv,
_parse_youtube_live_chats_csv,
_parse_app_installs,
_parse_location_history,
_parse_semantic_location_history,
@@ -48,18 +50,22 @@
r"Location History/Location( )?History.json": _parse_location_history, # old path to Location History
r"Location History/Records.json": _parse_location_history, # new path to Location History
r"Location History/Settings.json": None,
r"Location History \(Timeline\)/Settings.json": None,
# HTML/JSON activity-like files which aren't in 'My Activity'
# optional " and Youtube Music" to handle pre-2017 data
r"YouTube( and YouTube Music)?/history/.*?.html": _parse_html_activity,
r"YouTube( and YouTube Music)?/history/.*?.json": _parse_json_activity,
# basic list item files which have chat messages/comments
r"YouTube( and YouTube Music)?/my-comments/.*?.html": _parse_html_comment_file,
r"YouTube( and YouTube Music)?/comments/comments.csv": _parse_youtube_comments_csv,
r"YouTube( and YouTube Music)?/live\s*chats/live\s*chats.csv": _parse_youtube_live_chats_csv,
r"YouTube( and YouTube Music)?/my-live-chat-messages/.*?.html": _parse_html_comment_file,
r"YouTube( and YouTube Music)?/playlists/likes.json": _parse_likes,
r"YouTube( and YouTube Music)?/playlists/": None,
r"YouTube( and YouTube Music)?/subscriptions": None,
r"YouTube( and YouTube Music)?/videos": None,
r"YouTube( and YouTube Music)?/music-uploads": None,
r"YouTube( and YouTube Music)?/channels/": None,
r"My Activity/Assistant/.*.mp3": None, # might be interesting to extract timestamps
r"My Activity/Voice and Audio/.*.mp3": None,
r"My Activity/Takeout": None, # activity for when you made takeouts, dont need
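
One detail worth noting in the new entries: the live chat pattern uses `live\s*chats`, so it tolerates both the spaced and unspaced directory names. A quick standalone check (a sketch, not part of the library):

```python
import re

# the pattern added above; \s* absorbs the space Google sometimes omits
pat = re.compile(r"YouTube( and YouTube Music)?/live\s*chats/live\s*chats.csv")

assert pat.match("YouTube and YouTube Music/live chats/live chats.csv")
assert pat.match("YouTube/livechats/livechats.csv")
```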
45 changes: 45 additions & 0 deletions google_takeout_parser/models.py
@@ -48,10 +48,12 @@ class LocationInfo(NamedTuple):
sourceUrl: Optional[Url]


# fmt: off
class BaseEvent(Protocol):
@property
def key(self) -> Any:
...
# fmt: on


@dataclass
@@ -84,6 +86,10 @@ def key(self) -> Tuple[str, str, int]:

@dataclass
class YoutubeComment(BaseEvent):
"""
NOTE: this was the old format; takeout.google.com now returns a CSV file instead, which is modeled by CSVYoutubeComment below
"""

content: str
dt: datetime
urls: List[Url]
@@ -93,6 +99,43 @@ def key(self) -> int:
return int(self.dt.timestamp())


@dataclass
class CSVYoutubeComment(BaseEvent):
commentId: str
channelId: str
dt: datetime
price: Optional[str]
parentCommentId: Optional[str]
videoId: str
contentJSON: str

@property
def key(self) -> int:
return int(self.dt.timestamp())


# considered re-using the model above, but that might be confusing,
# and it's useful to know whether a message was from a livestream
# or a VOD
@dataclass
class CSVYoutubeLiveChat(BaseEvent):
"""
this is very similar to CSVYoutubeComment, but has a liveChatId instead of a commentId,
and it can't have a parentCommentId
"""

liveChatId: str
channelId: str
dt: datetime
price: Optional[str]
videoId: str
contentJSON: str

@property
def key(self) -> int:
return int(self.dt.timestamp())


@dataclass
class LikedYoutubeVideo(BaseEvent):
title: str
@@ -203,6 +246,8 @@ def key(self) -> Tuple[str, int]:
Location,
ChromeHistory,
YoutubeComment,
CSVYoutubeComment,
CSVYoutubeLiveChat,
PlaceVisit,
]

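
Both new models key on the comment's unix timestamp. Assuming callers use `key` to merge events across overlapping takeouts (an assumption about usage, not confirmed by this diff), deduplication becomes a simple set lookup:

```python
from typing import Any, Iterable, Iterator, Set

# assumption: overlapping takeouts are merged by dropping events
# whose .key has already been seen
def dedupe(events: Iterable[Any]) -> Iterator[Any]:
    seen: Set[Any] = set()
    for ev in events:
        if ev.key in seen:
            continue
        seen.add(ev.key)
        yield ev
```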
111 changes: 111 additions & 0 deletions google_takeout_parser/parse_csv.py
@@ -0,0 +1,111 @@
import csv
from pathlib import Path
from typing import List, TextIO, Iterator

from .models import CSVYoutubeComment, CSVYoutubeLiveChat
from .common import Res
from .time_utils import parse_json_utc_date


def _parse_youtube_comment_row(row: List[str]) -> Res[CSVYoutubeComment]:
# Comment ID,Channel ID,Comment Create Timestamp,Price,Parent Comment ID,Video ID,Comment Text
try:
(
comment_id,
channel_id,
created_at,
price,
parent_comment_id,
video_id,
textJSON,
) = row
except ValueError as e:
return e
return CSVYoutubeComment(
commentId=comment_id,
channelId=channel_id,
dt=parse_json_utc_date(created_at),
price=price,
parentCommentId=parent_comment_id if parent_comment_id.strip() else None,
videoId=video_id,
# for now just pass the contents of the message forwards as JSON,
# will add helpers that let the user access it in different ways programmatically
# instead of trying to define every access pattern in a model
contentJSON=textJSON,
)


def is_empty_row(row: List[str]) -> bool:
if len(row) == 0:
return True
for item in row:
if item.strip():
return False
return True


def _parse_youtube_comments_buffer(
buf: TextIO,
skip_first: bool = True,
) -> Iterator[Res[CSVYoutubeComment]]:
reader = csv.reader(buf)
if skip_first:
next(reader, None)  # tolerate an empty file; a bare next() would raise here
for row in reader:
if is_empty_row(row):
continue
if len(row) != 7:
yield ValueError(f"Expected 7 columns, got {len(row)}: {row}")
continue
yield _parse_youtube_comment_row(row)


def _parse_youtube_comments_csv(path: Path) -> Iterator[Res[CSVYoutubeComment]]:
with path.open("r", newline="") as f:
yield from _parse_youtube_comments_buffer(f)


# Live Chat ID,Channel ID,Live Chat Create Timestamp,Price,Video ID,Live Chat Text


def _parse_youtube_live_chat_row(row: List[str]) -> Res[CSVYoutubeLiveChat]:
try:
(
live_chat_id,
channel_id,
created_at,
price,
video_id,
textJSON,
) = row
except ValueError as e:
return e
return CSVYoutubeLiveChat(
liveChatId=live_chat_id,
channelId=channel_id,
dt=parse_json_utc_date(created_at),
price=price,
videoId=video_id,
contentJSON=textJSON,
)


def _parse_youtube_live_chats_buffer(
buf: TextIO,
skip_first: bool = True,
) -> Iterator[Res[CSVYoutubeLiveChat]]:
reader = csv.reader(buf)
if skip_first:
next(reader, None)  # tolerate an empty file; a bare next() would raise here
for row in reader:
if is_empty_row(row):
continue
if len(row) != 6:
yield ValueError(f"Expected 6 columns, got {len(row)}: {row}")
continue
yield _parse_youtube_live_chat_row(row)


def _parse_youtube_live_chats_csv(path: Path) -> Iterator[Res[CSVYoutubeLiveChat]]:
with path.open("r", newline="") as f:
yield from _parse_youtube_live_chats_buffer(f)
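
Rows that fail to parse are yielded as the exception itself rather than raised, so a caller filters on type. A minimal usage sketch (the Takeout path here is hypothetical):

```python
from pathlib import Path
from google_takeout_parser.parse_csv import _parse_youtube_comments_csv

csv_path = Path("Takeout/YouTube and YouTube Music/comments/comments.csv")
for res in _parse_youtube_comments_csv(csv_path):
    if isinstance(res, Exception):
        print(f"skipping bad row: {res}")  # e.g. wrong column count
    else:
        print(res.dt.isoformat(), res.videoId)
```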
Empty file.
60 changes: 60 additions & 0 deletions tests/test_csv.py
@@ -0,0 +1,60 @@
from io import StringIO
from datetime import datetime, timezone
from google_takeout_parser.parse_csv import (
_parse_youtube_comments_buffer,
_parse_youtube_live_chats_buffer,
)
from google_takeout_parser.models import CSVYoutubeComment, CSVYoutubeLiveChat


def test_parse_youtube_comment_buffer() -> None:
text_content = """UgxtiXQkY7gqHbldJ1F4AaABAg,UCA6DtnbZ2KJckyTYfXOwQNA,2023-09-19T17:42:53.434647+00:00,0,,WtOskFeLmr4,"{""takeoutSegments"":[{""text"":""coalowl the legend""}]}"
UgwDN8UeMxW4NDFbvY54AaABAg.9iwJkUYNcXa9u0lv3j3Abh,UCA6DtnbZ2KJckyTYfXOwQNA,2023-08-30T01:54:46.801024+00:00,0,UgwDN8UeMxW4NDFbvY54AaABAg,jH39c5-y6kg,"{""takeoutSegments"":[{""text"":""Ah, this is the reason why Ive never seen concurrent write failures myself, python's default timeout value is 5s, so it just waits in a busy loop if I have 'concurrent writers'""}]}"
"""

buf = StringIO(text_content)

res = list(_parse_youtube_comments_buffer(buf, skip_first=False))
assert len(res) == 2

assert res[0] == CSVYoutubeComment(
commentId="UgxtiXQkY7gqHbldJ1F4AaABAg",
channelId="UCA6DtnbZ2KJckyTYfXOwQNA",
dt=datetime(2023, 9, 19, 17, 42, 53, 434647, tzinfo=timezone.utc),
price="0",
parentCommentId=None,
videoId="WtOskFeLmr4",
contentJSON='{"takeoutSegments":[{"text":"coalowl the legend"}]}',
)

assert res[1] == CSVYoutubeComment(
commentId="UgwDN8UeMxW4NDFbvY54AaABAg.9iwJkUYNcXa9u0lv3j3Abh",
channelId="UCA6DtnbZ2KJckyTYfXOwQNA",
dt=datetime(2023, 8, 30, 1, 54, 46, 801024, tzinfo=timezone.utc),
price="0",
parentCommentId="UgwDN8UeMxW4NDFbvY54AaABAg",
videoId="jH39c5-y6kg",
contentJSON='{"takeoutSegments":[{"text":"Ah, this is the reason why Ive never seen concurrent write failures myself, python\'s default timeout value is 5s, so it just waits in a busy loop if I have \'concurrent writers\'"}]}',
)


def test_parse_youtube_live_chat_buffer() -> None:
text_content = """UgwsSD8yrDW7_h6F5vZ4AaABDqgB5OC1kgI,UCA6DtnbZ2KJckyTYfXOwQNA,2018-09-02T05:16:35.510381+00:00,0,0vGCh85obuI,"{""takeoutSegments"":[{""text"":""\""think the right thing\"" jeez""}]}"
"""

buf = StringIO(text_content)

res = list(_parse_youtube_live_chats_buffer(buf, skip_first=False))
assert len(res) == 1

assert res[0] == CSVYoutubeLiveChat(
liveChatId="UgwsSD8yrDW7_h6F5vZ4AaABDqgB5OC1kgI",
channelId="UCA6DtnbZ2KJckyTYfXOwQNA",
dt=datetime(2018, 9, 2, 5, 16, 35, 510381, tzinfo=timezone.utc),
price="0",
videoId="0vGCh85obuI",
contentJSON='{"takeoutSegments":[{"text":"\"think the right thing\" jeez"}]}',
)
