fix for youtube comments csv + old Chrome/MyActivity.json (#73)

* parse_json_activity: fix missing "header" in Chrome/MyActivity

Handles entries like this without errors:
```
{'title': 'Visited view-source:http://127.0.0.1:8000/annotating.html', 'titleUrl': 'https://www.google.com/url?q=view-source:http://127.0.0.1:8000/annotating.html&usg=AFQjCNFSMQX8pgDQhBd-j1Um3T0bFO1Zzg', 'time': '2019-07-20T23:50:11.333Z', 'products': ['Chrome']}
```

* parse_csv: fix YouTube comments parsing due to a format change in June 2024
karlicoss authored Sep 19, 2024
1 parent da1bc8d commit e801e59
Showing 3 changed files with 64 additions and 28 deletions.
35 changes: 11 additions & 24 deletions google_takeout_parser/parse_csv.py
```
@@ -9,19 +9,16 @@
 from .time_utils import parse_json_utc_date


-def _parse_youtube_comment_row(row: List[str]) -> Res[CSVYoutubeComment]:
-    # Comment ID,Channel ID,Comment Create Timestamp,Price,Parent Comment ID,Video ID,Comment Text
+def _parse_youtube_comment_row(row: Dict[str, Any]) -> Res[CSVYoutubeComment]:
     try:
-        (
-            comment_id,
-            channel_id,
-            created_at,
-            price,
-            parent_comment_id,
-            video_id,
-            textJSON,
-        ) = row
-    except ValueError as e:
+        comment_id = row['Comment ID']
+        channel_id = row['Channel ID']
+        created_at = row['Comment Create Timestamp']
+        price = row['Price']
+        parent_comment_id = row['Parent Comment ID']
+        video_id = row['Video ID']
+        textJSON = row['Comment Text']
+    except KeyError as e:
         return e
     return CSVYoutubeComment(
         commentId=comment_id,
@@ -46,19 +43,9 @@ def is_empty_row(row: List[str]) -> bool:
     return True


-def _parse_youtube_comments_buffer(
-    buf: TextIO,
-    skip_first: bool = True,
-) -> Iterator[Res[CSVYoutubeComment]]:
-    reader = csv.reader(buf)
-    if skip_first:
-        next(reader)
+def _parse_youtube_comments_buffer(buf: TextIO) -> Iterator[Res[CSVYoutubeComment]]:
+    reader = csv.DictReader(buf)
     for row in reader:
-        if is_empty_row(row):
-            continue
-        if len(row) != 7:
-            yield ValueError(f"Expected 7 columns, got {len(row)}: {row}")
-            continue
         yield _parse_youtube_comment_row(row)
```
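Why this fixes the June 2024 breakage — a minimal sketch, not code from the commit, with made-up sample values: `csv.DictReader` consumes the header row as field names and keys every row by them, so lookups no longer depend on column position, which is exactly what changed between the old and new exports.

```
import csv
import io

# Pre-June-2024 exports put "Comment ID" first; newer exports moved it
# after "Price". The header names stayed the same, only the order changed.
old = (
    "Comment ID,Channel ID,Comment Create Timestamp,Price,Parent Comment ID,Video ID,Comment Text\n"
    "abc,chan,2023-09-19T17:42:53+00:00,0,,vid123,{}\n"
)
new = (
    "Channel ID,Comment Create Timestamp,Price,Comment ID,Parent Comment ID,Video ID,Comment Text\n"
    "chan,2023-09-19T17:42:53+00:00,0,abc,,vid123,{}\n"
)

for raw in (old, new):
    # the same name-based lookups work for both column orders
    row = next(csv.DictReader(io.StringIO(raw)))
    assert row["Comment ID"] == "abc"
    assert row["Video ID"] == "vid123"
```

This is also why the `skip_first` flag and the hard-coded 7-column check could be dropped: the header line is consumed as field names, and a CSV whose header lacks an expected column now surfaces as a `KeyError` in `_parse_youtube_comment_row`.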
9 changes: 8 additions & 1 deletion google_takeout_parser/parse_json.py
```
@@ -63,7 +63,14 @@ def _parse_json_activity(p: Path) -> Iterator[Res[Activity]]:
         header = "YouTube"  # didn't have header
         time_str = blob["publishedAt"]
     else:
-        header = blob["header"]
+        _header = blob.get("header")
+        if _header is None:
+            # some pre-2021 MyActivity/Chrome/MyActivity.json contain a few items without a header
+            # these always seem to originate from viewing page source
+            if blob["title"].startswith("Visited view-source:"):
+                _header = "Chrome"
+        assert _header is not None, blob
+        header = _header
         time_str = blob["time"]

     yield Activity(
```
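To make the fallback concrete, here it is in isolation, run against the blob quoted in the commit message; `resolve_header` is a hypothetical standalone re-statement of the logic above, not a function from the codebase.

```
# `blob` is the entry from the commit message (titleUrl trimmed for brevity);
# `resolve_header` is an illustrative helper, not part of the library
blob = {
    "title": "Visited view-source:http://127.0.0.1:8000/annotating.html",
    "time": "2019-07-20T23:50:11.333Z",
    "products": ["Chrome"],
}

def resolve_header(blob: dict) -> str:
    header = blob.get("header")
    if header is None and blob["title"].startswith("Visited view-source:"):
        # pre-2021 Chrome exports can omit "header" on view-source visits
        header = "Chrome"
    assert header is not None, blob
    return header

assert resolve_header(blob) == "Chrome"
```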
48 changes: 45 additions & 3 deletions tests/test_csv.py
```
@@ -7,16 +7,23 @@
 from google_takeout_parser.models import CSVYoutubeComment, CSVYoutubeLiveChat


-def test_parse_youtube_comment_buffer() -> None:
-    text_content = """UgxtiXQkY7gqHbldJ1F4AaABAg,UCA6DtnbZ2KJckyTYfXOwQNA,2023-09-19T17:42:53.434647+00:00,0,,WtOskFeLmr4,"{""takeoutSegments"":[{""text"":""coalowl the legend""}]}"
+def test_parse_youtube_comment_buffer_old() -> None:
+    """Old format, pre June 2024"""
+
+    # deliberately add some new lines at the end -- real takeout also has them
+    text_content = """\
+Comment ID,Channel ID,Comment Create Timestamp,Price,Parent Comment ID,Video ID,Comment Text
+UgxtiXQkY7gqHbldJ1F4AaABAg,UCA6DtnbZ2KJckyTYfXOwQNA,2023-09-19T17:42:53.434647+00:00,0,,WtOskFeLmr4,"{""takeoutSegments"":[{""text"":""coalowl the legend""}]}"
 UgwDN8UeMxW4NDFbvY54AaABAg.9iwJkUYNcXa9u0lv3j3Abh,UCA6DtnbZ2KJckyTYfXOwQNA,2023-08-30T01:54:46.801024+00:00,0,UgwDN8UeMxW4NDFbvY54AaABAg,jH39c5-y6kg,"{""takeoutSegments"":[{""text"":""Ah, this is the reason why Ive never seen concurrent write failures myself, python's default timeout value is 5s, so it just waits in a busy loop if I have 'concurrent writers'""}]}"
+
+
 """

     buf = StringIO(text_content)

-    res = list(_parse_youtube_comments_buffer(buf, skip_first=False))
+    res = list(_parse_youtube_comments_buffer(buf))
     assert len(res) == 2

     assert res[0] == CSVYoutubeComment(
@@ -40,6 +47,41 @@ def test_parse_youtube_comment_buffer() -> None:
     )


+def test_parse_youtube_comment_buffer_new() -> None:
+    """New format, post June 2024"""
+
+    text_content = """\
+Channel ID,Comment Create Timestamp,Price,Comment ID,Parent Comment ID,Video ID,Comment Text
+UCYnl1cugi7Lw1h8j6JNqNEg,2023-04-14T07:39:35.956042+00:00,0,UgytHqobEtqoKm_-pYB4AaABAg,,rWVAzS6duAs,"{""text"":""\u003e I am about to get buried in the concrete""},{""text"":""\n""},{""text"":""the most normal Veritasium video!""}"
+UCYnl1cugi7Lw1h8j6JNqNEg,2016-01-29T18:26:53.255+00:00,0,UgiNMzGz_nAsjXfCoAEC,,ZuvK-oe647c,"{""text"":""Great illustration of Bell inequality!""}"
+
+
+"""
+
+    buf = StringIO(text_content)
+    res = list(_parse_youtube_comments_buffer(buf))
+    assert len(res) == 2
+
+    assert res[0] == CSVYoutubeComment(
+        commentId="UgytHqobEtqoKm_-pYB4AaABAg",
+        channelId="UCYnl1cugi7Lw1h8j6JNqNEg",
+        dt=datetime(2023, 4, 14, 7, 39, 35, 956042, tzinfo=timezone.utc),
+        price="0",
+        parentCommentId=None,
+        videoId="rWVAzS6duAs",
+        contentJSON='{"text":"> I am about to get buried in the concrete"},{"text":"\n"},{"text":"the most normal Veritasium video!"}',
+    )
+    assert res[1] == CSVYoutubeComment(
+        commentId="UgiNMzGz_nAsjXfCoAEC",
+        channelId="UCYnl1cugi7Lw1h8j6JNqNEg",
+        dt=datetime(2016, 1, 29, 18, 26, 53, 255000, tzinfo=timezone.utc),
+        price="0",
+        parentCommentId=None,
+        videoId="ZuvK-oe647c",
+        contentJSON='{"text":"Great illustration of Bell inequality!"}',
+    )
+
+
 def test_parse_youtube_live_chat_buffer() -> None:
     text_content = """UgwsSD8yrDW7_h6F5vZ4AaABDqgB5OC1kgI,UCA6DtnbZ2KJckyTYfXOwQNA,2018-09-02T05:16:35.510381+00:00,0,0vGCh85obuI,"{""takeoutSegments"":[{""text"":""\""think the right thing\"" jeez""}]}"
```
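One note on reading these fixtures, with a throwaway example that is not part of the test suite: the doubled quotes are standard CSV escaping — inside a quoted field, `""` decodes to a single literal `"` — which is how the embedded JSON payload survives the round trip through CSV.

```
import csv
import io

# the csv module un-doubles quotes when parsing a quoted field
raw = '"{""takeoutSegments"":[{""text"":""coalowl the legend""}]}"\n'
(field,) = next(csv.reader(io.StringIO(raw)))
assert field == '{"takeoutSegments":[{"text":"coalowl the legend"}]}'
```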
