add csv youtube/live chat parsing
purarue committed Feb 11, 2024
1 parent 78a2c5d commit 372f002
Showing 7 changed files with 233 additions and 1 deletion.
8 changes: 7 additions & 1 deletion README.md
@@ -46,7 +46,11 @@ This currently parses:
- Location History `Location History/Location History.json`, `Location History/Records.json`
- Youtube:
- History - `YouTube and YouTube Music/history/*.html|*.json`
- Comments - `YouTube and YouTube Music/my-comments/*.html`
- Comments
- Legacy HTML Comment format: `YouTube and YouTube Music/my-comments/*.html`
- New CSV/JSON format (mostly CSV, but the comment contents themselves are a JSON blob; see the sketch after this list):
- `Youtube/comments/comments.csv`
- `Youtube/live chats/live chats.csv`
- Live Chat Messages - `YouTube and YouTube Music/my-live-chat-messages/*.html`
- Likes: `YouTube and YouTube Music/playlists/likes.json`
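
The JSON blob uses the `takeoutSegments` shape visible in the tests added in this commit; a minimal sketch of pulling the plain comment text back out (the helper name is invented for illustration, not part of the library):

```python
import json

def comment_text(content_json: str) -> str:
    # join the text of each takeoutSegment; shape taken from the test data in this commit
    segments = json.loads(content_json).get("takeoutSegments", [])
    return "".join(seg.get("text", "") for seg in segments)

print(comment_text('{"takeoutSegments":[{"text":"coalowl the legend"}]}'))
# -> coalowl the legend
```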

@@ -218,6 +222,8 @@ Don't feel required to add support for all locales, it's somewhat annoying to swap

Though, if your takeout is in some language this doesn't support, you can [create an issue](https://github.com/seanbreckenridge/google_takeout_parser/issues/new?title=support+new+locale) with the file structure (run `find Takeout` and/or `tree Takeout`), or contribute a locale file by creating a `path -> function` mapping and adding it to the global `LOCALES` variables in `locales/all.py` and `locales/main.py` (a rough sketch follows below)
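
For illustration only, a locale file's mapping might look like the following; the translated paths here are invented and would need to be verified against a real export, and the real shape to copy is in `locales/en.py`:

```python
# hypothetical locale mapping, modeled on locales/en.py -- the German-looking
# paths are invented for illustration, not taken from a real takeout
from .common import _parse_json_activity, _parse_location_history

DE = {
    r"Meine Aktivitäten/.*?\.json": _parse_json_activity,
    r"Standortverlauf/Records.json": _parse_location_history,
    r"YouTube( und YouTube Music)?/playlists/": None,  # None = recognized, but skipped
}
```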

This is pretty difficult to maintain, as it requires a lot of manual testing from people who have access to these takeouts and who actively use the language the takeout is in. My google account's main language is English, so I upkeep that locale whenever I notice changes, but it's not trivial to port those changes to other locales without swapping my language, making an export, waiting, and then switching back. I keep track of mismatched changes [in this board](https://github.com/users/seanbreckenridge/projects/1/views/1)

Ideally, you would select everything when doing a takeout (not just the `My Activity`/`Chrome`/`Location History` like I suggested above), so [paths that are not parsed can be ignored properly](https://github.com/seanbreckenridge/google_takeout_parser/blob/4981c241c04b5b37265710dcc6ca00f19d1eafb4/google_takeout_parser/locales/en.py#L105C1-L113).

### Testing
4 changes: 4 additions & 0 deletions google_takeout_parser/locales/common.py
@@ -12,6 +12,10 @@
_parse_semantic_location_history,
_parse_chrome_history,
)
from ..parse_csv import ( # noqa: F401
_parse_youtube_comments_csv,
_parse_youtube_live_chats_csv,
)

BaseResults = Iterator[Res[BaseEvent]]

6 changes: 6 additions & 0 deletions google_takeout_parser/locales/en.py
@@ -4,6 +4,8 @@
_parse_html_comment_file,
_parse_json_activity,
_parse_likes,
_parse_youtube_comments_csv,
_parse_youtube_live_chats_csv,
_parse_app_installs,
_parse_location_history,
_parse_semantic_location_history,
@@ -48,18 +50,22 @@
r"Location History/Location( )?History.json": _parse_location_history, # old path to Location History
r"Location History/Records.json": _parse_location_history, # new path to Location History
r"Location History/Settings.json": None,
r"Location History \(Timeline\)/Settings.json": None,
# HTML/JSON activity-like files which aren't in 'My Activity'
# optional " and Youtube Music" to handle pre-2017 data
r"YouTube( and YouTube Music)?/history/.*?.html": _parse_html_activity,
r"YouTube( and YouTube Music)?/history/.*?.json": _parse_json_activity,
# basic list item files which have chat messages/comments
r"YouTube( and YouTube Music)?/my-comments/.*?.html": _parse_html_comment_file,
r"YouTube( and YouTube Music)?/comments/comments.csv": _parse_youtube_comments_csv,
r"YouTube( and YouTube Music)?/live\s*chats/live\s*chats.csv": _parse_youtube_live_chats_csv,
r"YouTube( and YouTube Music)?/my-live-chat-messages/.*?.html": _parse_html_comment_file,
r"YouTube( and YouTube Music)?/playlists/likes.json": _parse_likes,
r"YouTube( and YouTube Music)?/playlists/": None,
r"YouTube( and YouTube Music)?/subscriptions": None,
r"YouTube( and YouTube Music)?/videos": None,
r"YouTube( and YouTube Music)?/music-uploads": None,
r"YouTube( and YouTube Music)?/channels/": None,
r"My Activity/Assistant/.*.mp3": None, # might be interesting to extract timestamps
r"My Activity/Voice and Audio/.*.mp3": None,
r"My Activity/Takeout": None, # activity for when you made takeouts, dont need
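
One detail worth noting in the new entries: the live chat pattern uses `live\s*chats`, so it tolerates both the spaced and unspaced directory names. A quick standalone check (a sketch, not part of the library):

```python
import re

# the pattern added above; \s* absorbs the space Google sometimes omits
pat = re.compile(r"YouTube( and YouTube Music)?/live\s*chats/live\s*chats.csv")

assert pat.match("YouTube and YouTube Music/live chats/live chats.csv")
assert pat.match("YouTube/livechats/livechats.csv")
```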
45 changes: 45 additions & 0 deletions google_takeout_parser/models.py
@@ -48,10 +48,12 @@ class LocationInfo(NamedTuple):
sourceUrl: Optional[Url]


# fmt: off
class BaseEvent(Protocol):
@property
def key(self) -> Any:
...
# fmt: on


@dataclass
@@ -84,6 +86,10 @@ def key(self) -> Tuple[str, str, int]:

@dataclass
class YoutubeComment(BaseEvent):
"""
NOTE: this was the old format; takeout.google.com now returns a CSV file instead, which is modeled by CSVYoutubeComment below
"""

content: str
dt: datetime
urls: List[Url]
@@ -93,6 +99,43 @@ def key(self) -> int:
return int(self.dt.timestamp())


@dataclass
class CSVYoutubeComment(BaseEvent):
commentId: str
channelId: str
dt: datetime
price: Optional[str]
parentCommentId: Optional[str]
videoId: str
contentJSON: str

@property
def key(self) -> int:
return int(self.dt.timestamp())


# considered re-using the model above, but that might be confusing,
# and it's useful to know whether a message was from a livestream
# or a VOD
@dataclass
class CSVYoutubeLiveChat(BaseEvent):
"""
this is very similar to CSVYoutubeComment, but has a liveChatId instead of a commentId,
and it can't have a parentCommentId
"""

liveChatId: str
channelId: str
dt: datetime
price: Optional[str]
videoId: str
contentJSON: str

@property
def key(self) -> int:
return int(self.dt.timestamp())


@dataclass
class LikedYoutubeVideo(BaseEvent):
title: str
@@ -203,6 +246,8 @@ def key(self) -> Tuple[str, int]:
Location,
ChromeHistory,
YoutubeComment,
CSVYoutubeComment,
CSVYoutubeLiveChat,
PlaceVisit,
]

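
Both new models key on the comment's unix timestamp. Assuming callers use `key` to merge events across overlapping takeouts (an assumption about usage, not confirmed by this diff), deduplication becomes a simple set lookup:

```python
from typing import Any, Iterable, Iterator, Set

# assumption: overlapping takeouts are merged by dropping events
# whose .key has already been seen
def dedupe(events: Iterable[Any]) -> Iterator[Any]:
    seen: Set[Any] = set()
    for ev in events:
        if ev.key in seen:
            continue
        seen.add(ev.key)
        yield ev
```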
111 changes: 111 additions & 0 deletions google_takeout_parser/parse_csv.py
@@ -0,0 +1,111 @@
import csv
from pathlib import Path
from typing import List, TextIO, Iterator

from .models import CSVYoutubeComment, CSVYoutubeLiveChat
from .common import Res
from .time_utils import parse_json_utc_date


def _parse_youtube_comment_row(row: List[str]) -> Res[CSVYoutubeComment]:
# Comment ID,Channel ID,Comment Create Timestamp,Price,Parent Comment ID,Video ID,Comment Text
try:
(
comment_id,
channel_id,
created_at,
price,
parent_comment_id,
video_id,
textJSON,
) = row
except ValueError as e:
return e
return CSVYoutubeComment(
commentId=comment_id,
channelId=channel_id,
dt=parse_json_utc_date(created_at),
price=price,
parentCommentId=parent_comment_id if parent_comment_id.strip() else None,
videoId=video_id,
# for now just pass the contents of the message forwards as JSON,
# will add helpers that let the user access it in different ways programmatically
# instead of trying to define every access pattern in a model
contentJSON=textJSON,
)


def is_empty_row(row: List[str]) -> bool:
if len(row) == 0:
return True
for item in row:
if item.strip():
return False
return True


def _parse_youtube_comments_buffer(
buf: TextIO,
skip_first: bool = True,
) -> Iterator[Res[CSVYoutubeComment]]:
reader = csv.reader(buf)
if skip_first:
next(reader, None)  # tolerate an empty file; a bare next() would raise here
for row in reader:
if is_empty_row(row):
continue
if len(row) != 7:
yield ValueError(f"Expected 7 columns, got {len(row)}: {row}")
continue
yield _parse_youtube_comment_row(row)


def _parse_youtube_comments_csv(path: Path) -> Iterator[Res[CSVYoutubeComment]]:
with path.open("r", newline="") as f:
yield from _parse_youtube_comments_buffer(f)


# Live Chat ID,Channel ID,Live Chat Create Timestamp,Price,Video ID,Live Chat Text


def _parse_youtube_live_chat_row(row: List[str]) -> Res[CSVYoutubeLiveChat]:
try:
(
live_chat_id,
channel_id,
created_at,
price,
video_id,
textJSON,
) = row
except ValueError as e:
return e
return CSVYoutubeLiveChat(
liveChatId=live_chat_id,
channelId=channel_id,
dt=parse_json_utc_date(created_at),
price=price,
videoId=video_id,
contentJSON=textJSON,
)


def _parse_youtube_live_chats_buffer(
buf: TextIO,
skip_first: bool = True,
) -> Iterator[Res[CSVYoutubeLiveChat]]:
reader = csv.reader(buf)
if skip_first:
next(reader, None)  # tolerate an empty file; a bare next() would raise here
for row in reader:
if is_empty_row(row):
continue
if len(row) != 6:
yield ValueError(f"Expected 6 columns, got {len(row)}: {row}")
continue
yield _parse_youtube_live_chat_row(row)


def _parse_youtube_live_chats_csv(path: Path) -> Iterator[Res[CSVYoutubeLiveChat]]:
with path.open("r", newline="") as f:
yield from _parse_youtube_live_chats_buffer(f)
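
Rows that fail to parse are yielded as the exception itself rather than raised, so a caller filters on type. A minimal usage sketch (the Takeout path here is hypothetical):

```python
from pathlib import Path
from google_takeout_parser.parse_csv import _parse_youtube_comments_csv

csv_path = Path("Takeout/YouTube and YouTube Music/comments/comments.csv")
for res in _parse_youtube_comments_csv(csv_path):
    if isinstance(res, Exception):
        print(f"skipping bad row: {res}")  # e.g. wrong column count
    else:
        print(res.dt.isoformat(), res.videoId)
```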
Empty file.
60 changes: 60 additions & 0 deletions tests/test_csv.py
@@ -0,0 +1,60 @@
from io import StringIO
from datetime import datetime, timezone
from google_takeout_parser.parse_csv import (
_parse_youtube_comments_buffer,
_parse_youtube_live_chats_buffer,
)
from google_takeout_parser.models import CSVYoutubeComment, CSVYoutubeLiveChat


def test_parse_youtube_comment_buffer() -> None:
text_content = """UgxtiXQkY7gqHbldJ1F4AaABAg,UCA6DtnbZ2KJckyTYfXOwQNA,2023-09-19T17:42:53.434647+00:00,0,,WtOskFeLmr4,"{""takeoutSegments"":[{""text"":""coalowl the legend""}]}"
UgwDN8UeMxW4NDFbvY54AaABAg.9iwJkUYNcXa9u0lv3j3Abh,UCA6DtnbZ2KJckyTYfXOwQNA,2023-08-30T01:54:46.801024+00:00,0,UgwDN8UeMxW4NDFbvY54AaABAg,jH39c5-y6kg,"{""takeoutSegments"":[{""text"":""Ah, this is the reason why Ive never seen concurrent write failures myself, python's default timeout value is 5s, so it just waits in a busy loop if I have 'concurrent writers'""}]}"
"""

buf = StringIO(text_content)

res = list(_parse_youtube_comments_buffer(buf, skip_first=False))
assert len(res) == 2

assert res[0] == CSVYoutubeComment(
commentId="UgxtiXQkY7gqHbldJ1F4AaABAg",
channelId="UCA6DtnbZ2KJckyTYfXOwQNA",
dt=datetime(2023, 9, 19, 17, 42, 53, 434647, tzinfo=timezone.utc),
price="0",
parentCommentId=None,
videoId="WtOskFeLmr4",
contentJSON='{"takeoutSegments":[{"text":"coalowl the legend"}]}',
)

assert res[1] == CSVYoutubeComment(
commentId="UgwDN8UeMxW4NDFbvY54AaABAg.9iwJkUYNcXa9u0lv3j3Abh",
channelId="UCA6DtnbZ2KJckyTYfXOwQNA",
dt=datetime(2023, 8, 30, 1, 54, 46, 801024, tzinfo=timezone.utc),
price="0",
parentCommentId="UgwDN8UeMxW4NDFbvY54AaABAg",
videoId="jH39c5-y6kg",
contentJSON='{"takeoutSegments":[{"text":"Ah, this is the reason why Ive never seen concurrent write failures myself, python\'s default timeout value is 5s, so it just waits in a busy loop if I have \'concurrent writers\'"}]}',
)


def test_parse_youtube_live_chat_buffer() -> None:
text_content = """UgwsSD8yrDW7_h6F5vZ4AaABDqgB5OC1kgI,UCA6DtnbZ2KJckyTYfXOwQNA,2018-09-02T05:16:35.510381+00:00,0,0vGCh85obuI,"{""takeoutSegments"":[{""text"":""\""think the right thing\"" jeez""}]}"
"""

buf = StringIO(text_content)

res = list(_parse_youtube_live_chats_buffer(buf, skip_first=False))
assert len(res) == 1

assert res[0] == CSVYoutubeLiveChat(
liveChatId="UgwsSD8yrDW7_h6F5vZ4AaABDqgB5OC1kgI",
channelId="UCA6DtnbZ2KJckyTYfXOwQNA",
dt=datetime(2018, 9, 2, 5, 16, 35, 510381, tzinfo=timezone.utc),
price="0",
videoId="0vGCh85obuI",
contentJSON='{"takeoutSegments":[{"text":"\"think the right thing\" jeez"}]}',
)
