Skip to content

Commit

Permalink
parse_csv: fix youtube comments parsing due to changed format in June 2024
Browse files Browse the repository at this point in the history
  • Loading branch information
karlicoss committed Sep 19, 2024
1 parent 70c0e0b commit e9fdf39
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 27 deletions.
35 changes: 11 additions & 24 deletions google_takeout_parser/parse_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,16 @@
from .time_utils import parse_json_utc_date


def _parse_youtube_comment_row(row: List[str]) -> Res[CSVYoutubeComment]:
# Comment ID,Channel ID,Comment Create Timestamp,Price,Parent Comment ID,Video ID,Comment Text
def _parse_youtube_comment_row(row: Dict[str, Any]) -> Res[CSVYoutubeComment]:
try:
(
comment_id,
channel_id,
created_at,
price,
parent_comment_id,
video_id,
textJSON,
) = row
except ValueError as e:
comment_id = row['Comment ID']
channel_id = row['Channel ID']
created_at = row['Comment Create Timestamp']
price = row['Price']
parent_comment_id = row['Parent Comment ID']
video_id = row['Video ID']
textJSON = row['Comment Text']
except KeyError as e:
return e
return CSVYoutubeComment(
commentId=comment_id,
Expand All @@ -46,19 +43,9 @@ def is_empty_row(row: List[str]) -> bool:
return True


def _parse_youtube_comments_buffer(buf: TextIO) -> Iterator[Res[CSVYoutubeComment]]:
    """Yield parsed YouTube comments (or per-row errors) from a takeout CSV.

    The stream's first line is consumed as a header row (post-June-2024
    takeout format), so columns are matched by name rather than position.
    Rows that look empty are skipped; malformed rows yield an exception
    object instead of raising, per the Res error convention.
    """
    # NOTE(review): is_empty_row is annotated for List[str] but DictReader
    # yields dicts -- confirm it still detects blank rows correctly.
    for record in csv.DictReader(buf):
        if is_empty_row(record):
            continue
        yield _parse_youtube_comment_row(record)


Expand Down
48 changes: 45 additions & 3 deletions tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,23 @@
from google_takeout_parser.models import CSVYoutubeComment, CSVYoutubeLiveChat


def test_parse_youtube_comment_buffer() -> None:
text_content = """UgxtiXQkY7gqHbldJ1F4AaABAg,UCA6DtnbZ2KJckyTYfXOwQNA,2023-09-19T17:42:53.434647+00:00,0,,WtOskFeLmr4,"{""takeoutSegments"":[{""text"":""coalowl the legend""}]}"
def test_parse_youtube_comment_buffer_old() -> None:
"""Old format, pre June 2024"""

# deliberately add some new lines at the end -- real takeout also has them
text_content = """\
Comment ID,Channel ID,Comment Create Timestamp,Price,Parent Comment ID,Video ID,Comment Text
UgxtiXQkY7gqHbldJ1F4AaABAg,UCA6DtnbZ2KJckyTYfXOwQNA,2023-09-19T17:42:53.434647+00:00,0,,WtOskFeLmr4,"{""takeoutSegments"":[{""text"":""coalowl the legend""}]}"
UgwDN8UeMxW4NDFbvY54AaABAg.9iwJkUYNcXa9u0lv3j3Abh,UCA6DtnbZ2KJckyTYfXOwQNA,2023-08-30T01:54:46.801024+00:00,0,UgwDN8UeMxW4NDFbvY54AaABAg,jH39c5-y6kg,"{""takeoutSegments"":[{""text"":""Ah, this is the reason why Ive never seen concurrent write failures myself, python's default timeout value is 5s, so it just waits in a busy loop if I have 'concurrent writers'""}]}"
"""

buf = StringIO(text_content)

res = list(_parse_youtube_comments_buffer(buf, skip_first=False))
res = list(_parse_youtube_comments_buffer(buf))
assert len(res) == 2

assert res[0] == CSVYoutubeComment(
Expand All @@ -40,6 +47,41 @@ def test_parse_youtube_comment_buffer() -> None:
)


def test_parse_youtube_comment_buffer_new() -> None:
    """New format, post June 2024: header row present, Channel ID column first."""

    text_content = """\
Channel ID,Comment Create Timestamp,Price,Comment ID,Parent Comment ID,Video ID,Comment Text
UCYnl1cugi7Lw1h8j6JNqNEg,2023-04-14T07:39:35.956042+00:00,0,UgytHqobEtqoKm_-pYB4AaABAg,,rWVAzS6duAs,"{""text"":""\u003e I am about to get buried in the concrete""},{""text"":""\n""},{""text"":""the most normal Veritasium video!""}"
UCYnl1cugi7Lw1h8j6JNqNEg,2016-01-29T18:26:53.255+00:00,0,UgiNMzGz_nAsjXfCoAEC,,ZuvK-oe647c,"{""text"":""Great illustration of Bell inequality!""}"
"""

    parsed = list(_parse_youtube_comments_buffer(StringIO(text_content)))

    # Both rows should parse cleanly into CSVYoutubeComment values.
    expected = [
        CSVYoutubeComment(
            commentId="UgytHqobEtqoKm_-pYB4AaABAg",
            channelId="UCYnl1cugi7Lw1h8j6JNqNEg",
            dt=datetime(2023, 4, 14, 7, 39, 35, 956042, tzinfo=timezone.utc),
            price="0",
            parentCommentId=None,
            videoId="rWVAzS6duAs",
            contentJSON='{"text":"> I am about to get buried in the concrete"},{"text":"\n"},{"text":"the most normal Veritasium video!"}',
        ),
        CSVYoutubeComment(
            commentId="UgiNMzGz_nAsjXfCoAEC",
            channelId="UCYnl1cugi7Lw1h8j6JNqNEg",
            dt=datetime(2016, 1, 29, 18, 26, 53, 255000, tzinfo=timezone.utc),
            price="0",
            parentCommentId=None,
            videoId="ZuvK-oe647c",
            contentJSON='{"text":"Great illustration of Bell inequality!"}',
        ),
    ]
    assert parsed == expected


def test_parse_youtube_live_chat_buffer() -> None:
text_content = """UgwsSD8yrDW7_h6F5vZ4AaABDqgB5OC1kgI,UCA6DtnbZ2KJckyTYfXOwQNA,2018-09-02T05:16:35.510381+00:00,0,0vGCh85obuI,"{""takeoutSegments"":[{""text"":""\""think the right thing\"" jeez""}]}"
Expand Down

0 comments on commit e9fdf39

Please sign in to comment.