-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
7 changed files
with
233 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
import csv | ||
from pathlib import Path | ||
from typing import List, TextIO, Iterator | ||
|
||
from .models import CSVYoutubeComment, CSVYoutubeLiveChat | ||
from .common import Res | ||
from .time_utils import parse_json_utc_date | ||
|
||
|
||
def _parse_youtube_comment_row(row: List[str]) -> Res[CSVYoutubeComment]:
    """Convert one comments-CSV row into a CSVYoutubeComment.

    Expected columns:
    Comment ID,Channel ID,Comment Create Timestamp,Price,Parent Comment ID,Video ID,Comment Text

    Returns the ValueError itself (Res error convention) when the row does
    not unpack into exactly 7 fields.
    """
    try:
        comment_id, channel_id, created_at, price, parent_comment_id, video_id, text_json = row
    except ValueError as err:
        return err
    # a blank/whitespace Parent Comment ID means this is a top-level comment
    parent = parent_comment_id if parent_comment_id.strip() else None
    return CSVYoutubeComment(
        commentId=comment_id,
        channelId=channel_id,
        dt=parse_json_utc_date(created_at),
        price=price,
        parentCommentId=parent,
        videoId=video_id,
        # pass the message contents forward as raw JSON; helpers can expose
        # it programmatically instead of modelling every access pattern here
        contentJSON=text_json,
    )
|
||
|
||
def is_empty_row(row: List[str]) -> bool:
    """Return True when *row* has no cells, or every cell is blank/whitespace."""
    # all() is True for the zero-cell case as well, matching the explicit
    # length check of the original loop-based version
    return all(not cell.strip() for cell in row)
|
||
|
||
def _parse_youtube_comments_buffer( | ||
buf: TextIO, | ||
skip_first: bool = True, | ||
) -> Iterator[Res[CSVYoutubeComment]]: | ||
reader = csv.reader(buf) | ||
if skip_first: | ||
next(reader) | ||
for row in reader: | ||
if is_empty_row(row): | ||
continue | ||
if len(row) != 7: | ||
yield ValueError(f"Expected 7 columns, got {len(row)}: {row}") | ||
continue | ||
yield _parse_youtube_comment_row(row) | ||
|
||
|
||
def _parse_youtube_comments_csv(path: Path) -> Iterator[Res[CSVYoutubeComment]]:
    """Open the comments CSV at *path* and yield parsed comment results.

    Takeout exports are UTF-8, so the encoding is passed explicitly rather
    than relying on the platform default (e.g. cp1252 on Windows, which
    would mangle or reject non-ASCII comment text). newline="" is required
    by the csv module so newlines embedded in quoted fields survive.
    """
    with path.open("r", newline="", encoding="utf-8") as f:
        yield from _parse_youtube_comments_buffer(f)
|
||
|
||
# Live Chat ID,Channel ID,Live Chat Create Timestamp,Price,Video ID,Live Chat Text | ||
|
||
|
||
def _parse_youtube_live_chat_row(row: List[str]) -> Res[CSVYoutubeLiveChat]:
    """Convert one live-chat-CSV row into a CSVYoutubeLiveChat.

    Expected columns:
    Live Chat ID,Channel ID,Live Chat Create Timestamp,Price,Video ID,Live Chat Text

    Returns the ValueError itself (Res error convention) when the row does
    not unpack into exactly 6 fields.
    """
    try:
        live_chat_id, channel_id, created_at, price, video_id, text_json = row
    except ValueError as err:
        return err
    return CSVYoutubeLiveChat(
        liveChatId=live_chat_id,
        channelId=channel_id,
        dt=parse_json_utc_date(created_at),
        price=price,
        videoId=video_id,
        # raw JSON passthrough, same convention as the comment model
        contentJSON=text_json,
    )
|
||
|
||
def _parse_youtube_live_chats_buffer( | ||
buf: TextIO, | ||
skip_first: bool = True, | ||
) -> Iterator[Res[CSVYoutubeLiveChat]]: | ||
reader = csv.reader(buf) | ||
if skip_first: | ||
next(reader) | ||
for row in reader: | ||
if is_empty_row(row): | ||
continue | ||
if len(row) != 6: | ||
yield ValueError(f"Expected 6 columns, got {len(row)}: {row}") | ||
continue | ||
yield _parse_youtube_live_chat_row(row) | ||
|
||
|
||
def _parse_youtube_live_chats_csv(path: Path) -> Iterator[Res[CSVYoutubeLiveChat]]:
    """Open the live-chat CSV at *path* and yield parsed live-chat results.

    Takeout exports are UTF-8, so the encoding is passed explicitly rather
    than relying on the platform default (e.g. cp1252 on Windows, which
    would mangle or reject non-ASCII chat text). newline="" is required
    by the csv module so newlines embedded in quoted fields survive.
    """
    with path.open("r", newline="", encoding="utf-8") as f:
        yield from _parse_youtube_live_chats_buffer(f)
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
from io import StringIO | ||
from datetime import datetime, timezone | ||
from google_takeout_parser.parse_csv import ( | ||
_parse_youtube_comments_buffer, | ||
_parse_youtube_live_chats_buffer, | ||
) | ||
from google_takeout_parser.models import CSVYoutubeComment, CSVYoutubeLiveChat | ||
|
||
|
||
def test_parse_youtube_comment_buffer() -> None:
    """Parse two realistic comment rows from an in-memory buffer.

    Row 1 is a top-level comment (empty Parent Comment ID column -> None);
    row 2 is a reply with a populated Parent Comment ID. The doubled
    quotes ("") inside the fixture are CSV escaping for literal quote
    characters in the JSON payload column.
    """
    text_content = """UgxtiXQkY7gqHbldJ1F4AaABAg,UCA6DtnbZ2KJckyTYfXOwQNA,2023-09-19T17:42:53.434647+00:00,0,,WtOskFeLmr4,"{""takeoutSegments"":[{""text"":""coalowl the legend""}]}"
UgwDN8UeMxW4NDFbvY54AaABAg.9iwJkUYNcXa9u0lv3j3Abh,UCA6DtnbZ2KJckyTYfXOwQNA,2023-08-30T01:54:46.801024+00:00,0,UgwDN8UeMxW4NDFbvY54AaABAg,jH39c5-y6kg,"{""takeoutSegments"":[{""text"":""Ah, this is the reason why Ive never seen concurrent write failures myself, python's default timeout value is 5s, so it just waits in a busy loop if I have 'concurrent writers'""}]}"
"""

    buf = StringIO(text_content)

    # skip_first=False because the fixture has no header line
    res = list(_parse_youtube_comments_buffer(buf, skip_first=False))
    assert len(res) == 2

    assert res[0] == CSVYoutubeComment(
        commentId="UgxtiXQkY7gqHbldJ1F4AaABAg",
        channelId="UCA6DtnbZ2KJckyTYfXOwQNA",
        dt=datetime(2023, 9, 19, 17, 42, 53, 434647, tzinfo=timezone.utc),
        price="0",
        parentCommentId=None,
        videoId="WtOskFeLmr4",
        contentJSON='{"takeoutSegments":[{"text":"coalowl the legend"}]}',
    )

    assert res[1] == CSVYoutubeComment(
        commentId="UgwDN8UeMxW4NDFbvY54AaABAg.9iwJkUYNcXa9u0lv3j3Abh",
        channelId="UCA6DtnbZ2KJckyTYfXOwQNA",
        dt=datetime(2023, 8, 30, 1, 54, 46, 801024, tzinfo=timezone.utc),
        price="0",
        parentCommentId="UgwDN8UeMxW4NDFbvY54AaABAg",
        videoId="jH39c5-y6kg",
        contentJSON='{"takeoutSegments":[{"text":"Ah, this is the reason why Ive never seen concurrent write failures myself, python\'s default timeout value is 5s, so it just waits in a busy loop if I have \'concurrent writers\'"}]}',
    )
|
||
|
||
def test_parse_youtube_live_chat_buffer() -> None:
    """Parse a single realistic live-chat row from an in-memory buffer.

    The fixture mixes CSV quote escaping ("") with Python string escapes
    (\") to embed literal quote characters inside the JSON payload column.
    """
    text_content = """UgwsSD8yrDW7_h6F5vZ4AaABDqgB5OC1kgI,UCA6DtnbZ2KJckyTYfXOwQNA,2018-09-02T05:16:35.510381+00:00,0,0vGCh85obuI,"{""takeoutSegments"":[{""text"":""\""think the right thing\"" jeez""}]}"
"""

    buf = StringIO(text_content)

    # skip_first=False because the fixture has no header line
    res = list(_parse_youtube_live_chats_buffer(buf, skip_first=False))
    assert len(res) == 1

    assert res[0] == CSVYoutubeLiveChat(
        liveChatId="UgwsSD8yrDW7_h6F5vZ4AaABDqgB5OC1kgI",
        channelId="UCA6DtnbZ2KJckyTYfXOwQNA",
        dt=datetime(2018, 9, 2, 5, 16, 35, 510381, tzinfo=timezone.utc),
        price="0",
        videoId="0vGCh85obuI",
        contentJSON='{"takeoutSegments":[{"text":"\"think the right thing\" jeez"}]}',
    )