From 372f002cdf1ed2ceea070dde788075dbb2085c05 Mon Sep 17 00:00:00 2001
From: Sean Breckenridge
Date: Sat, 10 Feb 2024 17:37:57 -0800
Subject: [PATCH] add csv youtube/live chat parsing

---
 README.md                                    |   8 +-
 google_takeout_parser/locales/common.py      |   4 +
 google_takeout_parser/locales/en.py          |   6 +
 google_takeout_parser/models.py              |  45 ++++++++
 google_takeout_parser/parse_csv.py           | 111 +++++++++++++++++++
 google_takeout_parser/parse_youtube_chats.py |   0
 tests/test_csv.py                            |  60 ++++++++++
 7 files changed, 233 insertions(+), 1 deletion(-)
 create mode 100644 google_takeout_parser/parse_csv.py
 create mode 100644 google_takeout_parser/parse_youtube_chats.py
 create mode 100644 tests/test_csv.py

diff --git a/README.md b/README.md
index 191f686..a0db754 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,11 @@ This currently parses:
 - Location History `Location History/Location History.json`, `Location History/Records.json`
 - Youtube:
   - History - `YouTube and YouTube Music/history/*.html|*.json`
-  - Comments - `YouTube and YouTube Music/my-comments/*.html`
+  - Comments
+    - Legacy HTML Comment format: `YouTube and YouTube Music/my-comments/*.html`
+    - New CSV/JSON format (mostly CSV, but the comment contents themselves are a JSON blob):
+      - `Youtube/comments/comments.csv`
+      - `Youtube/live chats/live chats.csv`
   - Live Chat Messages - `YouTube and YouTube Music/my-live-chat-messages/*.html`
   - Likes: `YouTube and YouTube Music/playlists/likes.json`
@@ -218,6 +222,8 @@ Dont feel required to add support for all locales, its somewhat annoying to swap
 
 Though, if your takeout is in some language this doesn't support, you can [create an issue](https://github.com/seanbreckenridge/google_takeout_parser/issues/new?title=support+new+locale) with the file structure (run `find Takeout` and/or `tree Takeout`), or contribute a locale file by creating a `path -> function mapping`, and adding it to the global `LOCALES` variables in `locales/all.py` and `locales/main.py`
 
+This is pretty difficult to maintain, as it requires a lot of manual testing from people who have access to these takeouts and actively use the language the takeout is in. My Google account's main language is English, so I keep that locale up to date whenever I notice changes, but it's not trivial to port those changes to other locales without swapping my language, making an export, waiting, and then switching back. I keep track of mismatched changes [in this board](https://github.com/users/seanbreckenridge/projects/1/views/1)
+
 Ideally, you would select everything when doing a takeout (not just the `My Activity`/`Chrome`/`Location History` like I suggested above), so [paths that are not parsed can be ignored properly](https://github.com/seanbreckenridge/google_takeout_parser/blob/4981c241c04b5b37265710dcc6ca00f19d1eafb4/google_takeout_parser/locales/en.py#L105C1-L113).
 
 ### Testing
diff --git a/google_takeout_parser/locales/common.py b/google_takeout_parser/locales/common.py
index 3bc50aa..85ffe85 100644
--- a/google_takeout_parser/locales/common.py
+++ b/google_takeout_parser/locales/common.py
@@ -12,6 +12,10 @@
     _parse_semantic_location_history,
     _parse_chrome_history,
 )
+from ..parse_csv import (  # noqa: F401
+    _parse_youtube_comments_csv,
+    _parse_youtube_live_chats_csv,
+)
 
 BaseResults = Iterator[Res[BaseEvent]]
diff --git a/google_takeout_parser/locales/en.py b/google_takeout_parser/locales/en.py
index 769c669..3752dc3 100644
--- a/google_takeout_parser/locales/en.py
+++ b/google_takeout_parser/locales/en.py
@@ -4,6 +4,8 @@
     _parse_html_comment_file,
     _parse_json_activity,
     _parse_likes,
+    _parse_youtube_comments_csv,
+    _parse_youtube_live_chats_csv,
     _parse_app_installs,
     _parse_location_history,
     _parse_semantic_location_history,
@@ -48,18 +50,22 @@
     r"Location History/Location( )?History.json": _parse_location_history,  # old path to Location History
     r"Location History/Records.json": _parse_location_history,  # new path to Location History
     r"Location History/Settings.json": None,
+    r"Location History \(Timeline\)/Settings.json": None,
     # HTML/JSON activity-like files which aren't in 'My Activity'
     # optional " and Youtube Music" to handle pre-2017 data
     r"YouTube( and YouTube Music)?/history/.*?.html": _parse_html_activity,
     r"YouTube( and YouTube Music)?/history/.*?.json": _parse_json_activity,
     # basic list item files which have chat messages/comments
     r"YouTube( and YouTube Music)?/my-comments/.*?.html": _parse_html_comment_file,
+    r"YouTube( and YouTube Music)?/comments/comments.csv": _parse_youtube_comments_csv,
+    r"YouTube( and YouTube Music)?/live\s*chats/live\s*chats.csv": _parse_youtube_live_chats_csv,
     r"YouTube( and YouTube Music)?/my-live-chat-messages/.*?.html": _parse_html_comment_file,
     r"YouTube( and YouTube Music)?/playlists/likes.json": _parse_likes,
     r"YouTube( and YouTube Music)?/playlists/": None,
     r"YouTube( and YouTube Music)?/subscriptions": None,
     r"YouTube( and YouTube Music)?/videos": None,
     r"YouTube( and YouTube Music)?/music-uploads": None,
+    r"YouTube( and YouTube Music)?/channels/": None,
     r"My Activity/Assistant/.*.mp3": None,  # might be interesting to extract timestamps
     r"My Activity/Voice and Audio/.*.mp3": None,
     r"My Activity/Takeout": None,  # activity for when you made takeouts, dont need
diff --git a/google_takeout_parser/models.py b/google_takeout_parser/models.py
index b3bd59b..8160601 100644
--- a/google_takeout_parser/models.py
+++ b/google_takeout_parser/models.py
@@ -48,10 +48,12 @@ class LocationInfo(NamedTuple):
     sourceUrl: Optional[Url]
 
 
+# fmt: off
 class BaseEvent(Protocol):
     @property
     def key(self) -> Any: ...
+# fmt: on
 
 
 @dataclass
@@ -84,6 +86,10 @@ def key(self) -> Tuple[str, str, int]:
 
 @dataclass
 class YoutubeComment(BaseEvent):
+    """
+    NOTE: this is the old format; takeout.google.com now returns a CSV file instead, which is parsed into the CSVYoutubeComment model below
+    """
+
     content: str
     dt: datetime
     urls: List[Url]
@@ -93,6 +99,43 @@ def key(self) -> int:
         return int(self.dt.timestamp())
 
 
+@dataclass
+class CSVYoutubeComment(BaseEvent):
+    commentId: str
+    channelId: str
+    dt: datetime
+    price: Optional[str]
+    parentCommentId: Optional[str]
+    videoId: str
+    contentJSON: str
+
+    @property
+    def key(self) -> int:
+        return int(self.dt.timestamp())
+
+
+# considered re-using the model above, but that might be confusing,
+# and it's useful to know if a message was from a livestream
+# or a VOD
+@dataclass
+class CSVYoutubeLiveChat(BaseEvent):
+    """
+    this is very similar to CSVYoutubeComment, but has a liveChatId instead of a commentId
+    and can't have a parentCommentId
+    """
+
+    liveChatId: str
+    channelId: str
+    dt: datetime
+    price: Optional[str]
+    videoId: str
+    contentJSON: str
+
+    @property
+    def key(self) -> int:
+        return int(self.dt.timestamp())
+
+
 @dataclass
 class LikedYoutubeVideo(BaseEvent):
     title: str
@@ -203,6 +246,8 @@ def key(self) -> Tuple[str, int]:
     Location,
     ChromeHistory,
     YoutubeComment,
+    CSVYoutubeComment,
+    CSVYoutubeLiveChat,
     PlaceVisit,
 ]
diff --git a/google_takeout_parser/parse_csv.py b/google_takeout_parser/parse_csv.py
new file mode 100644
index 0000000..c6f2700
--- /dev/null
+++ b/google_takeout_parser/parse_csv.py
@@ -0,0 +1,111 @@
+import csv
+from pathlib import Path
+from typing import List, TextIO, Iterator
+
+from .models import CSVYoutubeComment, CSVYoutubeLiveChat
+from .common import Res
+from .time_utils import parse_json_utc_date
+
+
+def _parse_youtube_comment_row(row: List[str]) -> Res[CSVYoutubeComment]:
+    # Comment ID,Channel ID,Comment Create Timestamp,Price,Parent Comment ID,Video ID,Comment Text
+    try:
+        (
+            comment_id,
+            channel_id,
+            created_at,
+            price,
+            parent_comment_id,
+            video_id,
+            textJSON,
+        ) = row
+    except ValueError as e:
+        return e
+    return CSVYoutubeComment(
+        commentId=comment_id,
+        channelId=channel_id,
+        dt=parse_json_utc_date(created_at),
+        price=price,
+        parentCommentId=parent_comment_id if parent_comment_id.strip() else None,
+        videoId=video_id,
+        # for now, just pass the contents of the message along as raw JSON;
+        # will add helpers that let the user access it in different ways programmatically
+        # instead of trying to define every access pattern in a model
+        contentJSON=textJSON,
+    )
+
+
+def is_empty_row(row: List[str]) -> bool:
+    if len(row) == 0:
+        return True
+    for item in row:
+        if item.strip():
+            return False
+    return True
+
+
+def _parse_youtube_comments_buffer(
+    buf: TextIO,
+    skip_first: bool = True,
+) -> Iterator[Res[CSVYoutubeComment]]:
+    reader = csv.reader(buf)
+    if skip_first:
+        next(reader, None)  # skip the header row; None default tolerates an empty file
+    for row in reader:
+        if is_empty_row(row):
+            continue
+        if len(row) != 7:
+            yield ValueError(f"Expected 7 columns, got {len(row)}: {row}")
+            continue
+        yield _parse_youtube_comment_row(row)
+
+
+def _parse_youtube_comments_csv(path: Path) -> Iterator[Res[CSVYoutubeComment]]:
+    with path.open("r", newline="") as f:
+        yield from _parse_youtube_comments_buffer(f)
+
+
+# Live Chat ID,Channel ID,Live Chat Create Timestamp,Price,Video ID,Live Chat Text
+
+
+def _parse_youtube_live_chat_row(row: List[str]) -> Res[CSVYoutubeLiveChat]:
+    try:
+        (
+            live_chat_id,
+            channel_id,
+            created_at,
+            price,
+            video_id,
+            textJSON,
+        ) = row
+    except ValueError as e:
+        return e
+    return CSVYoutubeLiveChat(
+        liveChatId=live_chat_id,
+        channelId=channel_id,
+        dt=parse_json_utc_date(created_at),
+        price=price,
+        videoId=video_id,
+        contentJSON=textJSON,
+    )
+
+
+def _parse_youtube_live_chats_buffer(
+    buf: TextIO,
+    skip_first: bool = True,
+) -> Iterator[Res[CSVYoutubeLiveChat]]:
+    reader = csv.reader(buf)
+    if skip_first:
+        next(reader, None)  # skip the header row; None default tolerates an empty file
+    for row in reader:
+        if is_empty_row(row):
+            continue
+        if len(row) != 6:
+            yield ValueError(f"Expected 6 columns, got {len(row)}: {row}")
+            continue
+        yield _parse_youtube_live_chat_row(row)
+
+
+def _parse_youtube_live_chats_csv(path: Path) -> Iterator[Res[CSVYoutubeLiveChat]]:
+    with path.open("r", newline="") as f:
+        yield from _parse_youtube_live_chats_buffer(f)
diff --git a/google_takeout_parser/parse_youtube_chats.py b/google_takeout_parser/parse_youtube_chats.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_csv.py b/tests/test_csv.py
new file mode 100644
index 0000000..f16e22b
--- /dev/null
+++ b/tests/test_csv.py
@@ -0,0 +1,60 @@
+from io import StringIO
+from datetime import datetime, timezone
+from google_takeout_parser.parse_csv import (
+    _parse_youtube_comments_buffer,
+    _parse_youtube_live_chats_buffer,
+)
+from google_takeout_parser.models import CSVYoutubeComment, CSVYoutubeLiveChat
+
+
+def test_parse_youtube_comment_buffer() -> None:
+    text_content = """UgxtiXQkY7gqHbldJ1F4AaABAg,UCA6DtnbZ2KJckyTYfXOwQNA,2023-09-19T17:42:53.434647+00:00,0,,WtOskFeLmr4,"{""takeoutSegments"":[{""text"":""coalowl the legend""}]}"
+UgwDN8UeMxW4NDFbvY54AaABAg.9iwJkUYNcXa9u0lv3j3Abh,UCA6DtnbZ2KJckyTYfXOwQNA,2023-08-30T01:54:46.801024+00:00,0,UgwDN8UeMxW4NDFbvY54AaABAg,jH39c5-y6kg,"{""takeoutSegments"":[{""text"":""Ah, this is the reason why Ive never seen concurrent write failures myself, python's default timeout value is 5s, so it just waits in a busy loop if I have 'concurrent writers'""}]}"
+
+
+"""
+
+    buf = StringIO(text_content)
+
+    res = list(_parse_youtube_comments_buffer(buf, skip_first=False))
+    assert len(res) == 2
+
+    assert res[0] == CSVYoutubeComment(
+        commentId="UgxtiXQkY7gqHbldJ1F4AaABAg",
+        channelId="UCA6DtnbZ2KJckyTYfXOwQNA",
+        dt=datetime(2023, 9, 19, 17, 42, 53, 434647, tzinfo=timezone.utc),
+        price="0",
+        parentCommentId=None,
+        videoId="WtOskFeLmr4",
+        contentJSON='{"takeoutSegments":[{"text":"coalowl the legend"}]}',
+    )
+
+    assert res[1] == CSVYoutubeComment(
+        commentId="UgwDN8UeMxW4NDFbvY54AaABAg.9iwJkUYNcXa9u0lv3j3Abh",
+        channelId="UCA6DtnbZ2KJckyTYfXOwQNA",
+        dt=datetime(2023, 8, 30, 1, 54, 46, 801024, tzinfo=timezone.utc),
+        price="0",
+        parentCommentId="UgwDN8UeMxW4NDFbvY54AaABAg",
+        videoId="jH39c5-y6kg",
+        contentJSON='{"takeoutSegments":[{"text":"Ah, this is the reason why Ive never seen concurrent write failures myself, python\'s default timeout value is 5s, so it just waits in a busy loop if I have \'concurrent writers\'"}]}',
+    )
+
+
+def test_parse_youtube_live_chat_buffer() -> None:
+    text_content = """UgwsSD8yrDW7_h6F5vZ4AaABDqgB5OC1kgI,UCA6DtnbZ2KJckyTYfXOwQNA,2018-09-02T05:16:35.510381+00:00,0,0vGCh85obuI,"{""takeoutSegments"":[{""text"":""\""think the right thing\"" jeez""}]}"
+
+    """
+
+    buf = StringIO(text_content)
+
+    res = list(_parse_youtube_live_chats_buffer(buf, skip_first=False))
+    assert len(res) == 1
+
+    assert res[0] == CSVYoutubeLiveChat(
+        liveChatId="UgwsSD8yrDW7_h6F5vZ4AaABDqgB5OC1kgI",
+        channelId="UCA6DtnbZ2KJckyTYfXOwQNA",
+        dt=datetime(2018, 9, 2, 5, 16, 35, 510381, tzinfo=timezone.utc),
+        price="0",
+        videoId="0vGCh85obuI",
+        contentJSON='{"takeoutSegments":[{"text":"\"think the right thing\" jeez"}]}',
+    )
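Note, not part of the patch above: contentJSON is deliberately left as the raw JSON blob from the CSV, and the comment in parse_csv.py says helpers for reading it will come later. Below is a minimal consumer sketch of what such a helper could look like, assuming only the "takeoutSegments" shape seen in the test fixtures; extract_comment_text and the Takeout path are hypothetical, illustrative names, not part of the library.

import json
from pathlib import Path
from typing import Any, Dict, List

from google_takeout_parser.models import CSVYoutubeComment
from google_takeout_parser.parse_csv import _parse_youtube_comments_csv


def extract_comment_text(content_json: str) -> str:
    # flatten the "takeoutSegments" list into plain text; segments in the test
    # fixtures only carry a "text" key, anything else is ignored here
    data: Dict[str, Any] = json.loads(content_json)
    segments: List[Dict[str, Any]] = data.get("takeoutSegments", [])
    return "".join(seg.get("text", "") for seg in segments)


if __name__ == "__main__":
    # hypothetical local path to a new-style export; adjust for your own takeout
    comments_csv = Path("Takeout/YouTube and YouTube Music/comments/comments.csv")
    for res in _parse_youtube_comments_csv(comments_csv):
        if isinstance(res, Exception):
            # the parser yields exceptions for malformed rows instead of raising
            print(f"skipping malformed row: {res}")
            continue
        assert isinstance(res, CSVYoutubeComment)
        print(res.dt.isoformat(), res.videoId, extract_comment_text(res.contentJSON))

Decoding the blob at the call site like this keeps the dataclass stable even if Google changes the shape of the JSON later.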