fix for youtube comments csv + old Chrome/MyActivity.json (#73)

* parse_json_activity: fix missing "header" in Chrome/MyActivity

Handles entries like this without errors:
```
{'title': 'Visited view-source:http://127.0.0.1:8000/annotating.html', 'titleUrl': 'https://www.google.com/url?q=view-source:http://127.0.0.1:8000/annotating.html&usg=AFQjCNFSMQX8pgDQhBd-j1Um3T0bFO1Zzg', 'time': '2019-07-20T23:50:11.333Z', 'products': ['Chrome']}
```

* parse_csv: fix YouTube comments parsing due to a format change in June 2024
karlicoss authored Sep 19, 2024
1 parent da1bc8d commit e801e59
Showing 3 changed files with 64 additions and 28 deletions.
35 changes: 11 additions & 24 deletions google_takeout_parser/parse_csv.py
```
@@ -9,19 +9,16 @@
 from .time_utils import parse_json_utc_date


-def _parse_youtube_comment_row(row: List[str]) -> Res[CSVYoutubeComment]:
-    # Comment ID,Channel ID,Comment Create Timestamp,Price,Parent Comment ID,Video ID,Comment Text
+def _parse_youtube_comment_row(row: Dict[str, Any]) -> Res[CSVYoutubeComment]:
     try:
-        (
-            comment_id,
-            channel_id,
-            created_at,
-            price,
-            parent_comment_id,
-            video_id,
-            textJSON,
-        ) = row
-    except ValueError as e:
+        comment_id = row['Comment ID']
+        channel_id = row['Channel ID']
+        created_at = row['Comment Create Timestamp']
+        price = row['Price']
+        parent_comment_id = row['Parent Comment ID']
+        video_id = row['Video ID']
+        textJSON = row['Comment Text']
+    except KeyError as e:
         return e
     return CSVYoutubeComment(
         commentId=comment_id,
@@ -46,19 +43,9 @@ def is_empty_row(row: List[str]) -> bool:
     return True


-def _parse_youtube_comments_buffer(
-    buf: TextIO,
-    skip_first: bool = True,
-) -> Iterator[Res[CSVYoutubeComment]]:
-    reader = csv.reader(buf)
-    if skip_first:
-        next(reader)
+def _parse_youtube_comments_buffer(buf: TextIO) -> Iterator[Res[CSVYoutubeComment]]:
+    reader = csv.DictReader(buf)
     for row in reader:
-        if is_empty_row(row):
-            continue
-        if len(row) != 7:
-            yield ValueError(f"Expected 7 columns, got {len(row)}: {row}")
-            continue
         yield _parse_youtube_comment_row(row)
```
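Why this fixes the June 2024 breakage — a minimal sketch, not code from the commit, with made-up sample values: `csv.DictReader` consumes the header row as field names and keys every row by them, so lookups no longer depend on column position, which is exactly what changed between the old and new exports.

```
import csv
import io

# Pre-June-2024 exports put "Comment ID" first; newer exports moved it
# after "Price". The header names stayed the same, only the order changed.
old = (
    "Comment ID,Channel ID,Comment Create Timestamp,Price,Parent Comment ID,Video ID,Comment Text\n"
    "abc,chan,2023-09-19T17:42:53+00:00,0,,vid123,{}\n"
)
new = (
    "Channel ID,Comment Create Timestamp,Price,Comment ID,Parent Comment ID,Video ID,Comment Text\n"
    "chan,2023-09-19T17:42:53+00:00,0,abc,,vid123,{}\n"
)

for raw in (old, new):
    # the same name-based lookups work for both column orders
    row = next(csv.DictReader(io.StringIO(raw)))
    assert row["Comment ID"] == "abc"
    assert row["Video ID"] == "vid123"
```

This is also why the `skip_first` flag and the hard-coded 7-column check could be dropped: the header line is consumed as field names, and a CSV whose header lacks an expected column now surfaces as a `KeyError` in `_parse_youtube_comment_row`.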
9 changes: 8 additions & 1 deletion google_takeout_parser/parse_json.py
```
@@ -63,7 +63,14 @@ def _parse_json_activity(p: Path) -> Iterator[Res[Activity]]:
         header = "YouTube"  # didn't have header
         time_str = blob["publishedAt"]
     else:
-        header = blob["header"]
+        _header = blob.get("header")
+        if _header is None:
+            # some pre-2021 MyActivity/Chrome/MyActivity.json contain a few items without a header
+            # these always seem to originate from viewing page source
+            if blob["title"].startswith("Visited view-source:"):
+                _header = "Chrome"
+        assert _header is not None, blob
+        header = _header
         time_str = blob["time"]

     yield Activity(
```
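To make the fallback concrete, here it is in isolation, run against the blob quoted in the commit message; `resolve_header` is a hypothetical standalone re-statement of the logic above, not a function from the codebase.

```
# `blob` is the entry from the commit message (titleUrl trimmed for brevity);
# `resolve_header` is an illustrative helper, not part of the library
blob = {
    "title": "Visited view-source:http://127.0.0.1:8000/annotating.html",
    "time": "2019-07-20T23:50:11.333Z",
    "products": ["Chrome"],
}

def resolve_header(blob: dict) -> str:
    header = blob.get("header")
    if header is None and blob["title"].startswith("Visited view-source:"):
        # pre-2021 Chrome exports can omit "header" on view-source visits
        header = "Chrome"
    assert header is not None, blob
    return header

assert resolve_header(blob) == "Chrome"
```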
48 changes: 45 additions & 3 deletions tests/test_csv.py
```
@@ -7,16 +7,23 @@
 from google_takeout_parser.models import CSVYoutubeComment, CSVYoutubeLiveChat


-def test_parse_youtube_comment_buffer() -> None:
-    text_content = """UgxtiXQkY7gqHbldJ1F4AaABAg,UCA6DtnbZ2KJckyTYfXOwQNA,2023-09-19T17:42:53.434647+00:00,0,,WtOskFeLmr4,"{""takeoutSegments"":[{""text"":""coalowl the legend""}]}"
+def test_parse_youtube_comment_buffer_old() -> None:
+    """Old format, pre June 2024"""
+
+    # deliberately add some new lines at the end -- real takeout also has them
+    text_content = """\
+Comment ID,Channel ID,Comment Create Timestamp,Price,Parent Comment ID,Video ID,Comment Text
+UgxtiXQkY7gqHbldJ1F4AaABAg,UCA6DtnbZ2KJckyTYfXOwQNA,2023-09-19T17:42:53.434647+00:00,0,,WtOskFeLmr4,"{""takeoutSegments"":[{""text"":""coalowl the legend""}]}"
 UgwDN8UeMxW4NDFbvY54AaABAg.9iwJkUYNcXa9u0lv3j3Abh,UCA6DtnbZ2KJckyTYfXOwQNA,2023-08-30T01:54:46.801024+00:00,0,UgwDN8UeMxW4NDFbvY54AaABAg,jH39c5-y6kg,"{""takeoutSegments"":[{""text"":""Ah, this is the reason why Ive never seen concurrent write failures myself, python's default timeout value is 5s, so it just waits in a busy loop if I have 'concurrent writers'""}]}"
+
+
 """

     buf = StringIO(text_content)

-    res = list(_parse_youtube_comments_buffer(buf, skip_first=False))
+    res = list(_parse_youtube_comments_buffer(buf))
     assert len(res) == 2

     assert res[0] == CSVYoutubeComment(
@@ -40,6 +47,41 @@ def test_parse_youtube_comment_buffer() -> None:
     )


+def test_parse_youtube_comment_buffer_new() -> None:
+    """New format, post June 2024"""
+
+    text_content = """\
+Channel ID,Comment Create Timestamp,Price,Comment ID,Parent Comment ID,Video ID,Comment Text
+UCYnl1cugi7Lw1h8j6JNqNEg,2023-04-14T07:39:35.956042+00:00,0,UgytHqobEtqoKm_-pYB4AaABAg,,rWVAzS6duAs,"{""text"":""\u003e I am about to get buried in the concrete""},{""text"":""\n""},{""text"":""the most normal Veritasium video!""}"
+UCYnl1cugi7Lw1h8j6JNqNEg,2016-01-29T18:26:53.255+00:00,0,UgiNMzGz_nAsjXfCoAEC,,ZuvK-oe647c,"{""text"":""Great illustration of Bell inequality!""}"
+
+
+"""
+
+    buf = StringIO(text_content)
+    res = list(_parse_youtube_comments_buffer(buf))
+    assert len(res) == 2
+
+    assert res[0] == CSVYoutubeComment(
+        commentId="UgytHqobEtqoKm_-pYB4AaABAg",
+        channelId="UCYnl1cugi7Lw1h8j6JNqNEg",
+        dt=datetime(2023, 4, 14, 7, 39, 35, 956042, tzinfo=timezone.utc),
+        price="0",
+        parentCommentId=None,
+        videoId="rWVAzS6duAs",
+        contentJSON='{"text":"> I am about to get buried in the concrete"},{"text":"\n"},{"text":"the most normal Veritasium video!"}',
+    )
+    assert res[1] == CSVYoutubeComment(
+        commentId="UgiNMzGz_nAsjXfCoAEC",
+        channelId="UCYnl1cugi7Lw1h8j6JNqNEg",
+        dt=datetime(2016, 1, 29, 18, 26, 53, 255000, tzinfo=timezone.utc),
+        price="0",
+        parentCommentId=None,
+        videoId="ZuvK-oe647c",
+        contentJSON='{"text":"Great illustration of Bell inequality!"}',
+    )
+
+
 def test_parse_youtube_live_chat_buffer() -> None:
     text_content = """UgwsSD8yrDW7_h6F5vZ4AaABDqgB5OC1kgI,UCA6DtnbZ2KJckyTYfXOwQNA,2018-09-02T05:16:35.510381+00:00,0,0vGCh85obuI,"{""takeoutSegments"":[{""text"":""\""think the right thing\"" jeez""}]}"
```
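One note on reading these fixtures, with a throwaway example that is not part of the test suite: the doubled quotes are standard CSV escaping — inside a quoted field, `""` decodes to a single literal `"` — which is how the embedded JSON payload survives the round trip through CSV.

```
import csv
import io

# the csv module un-doubles quotes when parsing a quoted field
raw = '"{""takeoutSegments"":[{""text"":""coalowl the legend""}]}"\n'
(field,) = next(csv.reader(io.StringIO(raw)))
assert field == '{"takeoutSegments":[{"text":"coalowl the legend"}]}'
```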
