allowlist for http -> https, resolves #31 (#48)
* allowlist for http -> https, resolves #31

* remove . prefix from google urls

* convert suffix list to list, no reason for set

* remove extra https check

* mention converted to what
purarue authored Oct 1, 2023
1 parent 602ef23 commit 6df75ee
Showing 7 changed files with 310 additions and 24 deletions.
238 changes: 238 additions & 0 deletions google_takeout_parser/http_allowlist.py
@@ -0,0 +1,238 @@
import logging
from typing import Set, Optional, List

from .log import logger

from urllib.parse import urlsplit, urlunsplit

# exact matches
CONVERT_HTTP: Set[str] = {
"m.youtube.com",
"www.youtube.com",
"youtube.com",
"bp0.blogger.com",
}

# anything that ends with these domains
# curl -sL 'https://www.google.com/supported_domains'
CONVERT_HTTP_SUFFIX: List[str] = [
"google.com",
"google.ad",
"google.ae",
"google.com.af",
"google.com.ag",
"google.al",
"google.am",
"google.co.ao",
"google.com.ar",
"google.as",
"google.at",
"google.com.au",
"google.az",
"google.ba",
"google.com.bd",
"google.be",
"google.bf",
"google.bg",
"google.com.bh",
"google.bi",
"google.bj",
"google.com.bn",
"google.com.bo",
"google.com.br",
"google.bs",
"google.bt",
"google.co.bw",
"google.by",
"google.com.bz",
"google.ca",
"google.cd",
"google.cf",
"google.cg",
"google.ch",
"google.ci",
"google.co.ck",
"google.cl",
"google.cm",
"google.cn",
"google.com.co",
"google.co.cr",
"google.com.cu",
"google.cv",
"google.com.cy",
"google.cz",
"google.de",
"google.dj",
"google.dk",
"google.dm",
"google.com.do",
"google.dz",
"google.com.ec",
"google.ee",
"google.com.eg",
"google.es",
"google.com.et",
"google.fi",
"google.com.fj",
"google.fm",
"google.fr",
"google.ga",
"google.ge",
"google.gg",
"google.com.gh",
"google.com.gi",
"google.gl",
"google.gm",
"google.gr",
"google.com.gt",
"google.gy",
"google.com.hk",
"google.hn",
"google.hr",
"google.ht",
"google.hu",
"google.co.id",
"google.ie",
"google.co.il",
"google.im",
"google.co.in",
"google.iq",
"google.is",
"google.it",
"google.je",
"google.com.jm",
"google.jo",
"google.co.jp",
"google.co.ke",
"google.com.kh",
"google.ki",
"google.kg",
"google.co.kr",
"google.com.kw",
"google.kz",
"google.la",
"google.com.lb",
"google.li",
"google.lk",
"google.co.ls",
"google.lt",
"google.lu",
"google.lv",
"google.com.ly",
"google.co.ma",
"google.md",
"google.me",
"google.mg",
"google.mk",
"google.ml",
"google.com.mm",
"google.mn",
"google.com.mt",
"google.mu",
"google.mv",
"google.mw",
"google.com.mx",
"google.com.my",
"google.co.mz",
"google.com.na",
"google.com.ng",
"google.com.ni",
"google.ne",
"google.nl",
"google.no",
"google.com.np",
"google.nr",
"google.nu",
"google.co.nz",
"google.com.om",
"google.com.pa",
"google.com.pe",
"google.com.pg",
"google.com.ph",
"google.com.pk",
"google.pl",
"google.pn",
"google.com.pr",
"google.ps",
"google.pt",
"google.com.py",
"google.com.qa",
"google.ro",
"google.ru",
"google.rw",
"google.com.sa",
"google.com.sb",
"google.sc",
"google.se",
"google.com.sg",
"google.sh",
"google.si",
"google.sk",
"google.com.sl",
"google.sn",
"google.so",
"google.sm",
"google.sr",
"google.st",
"google.com.sv",
"google.td",
"google.tg",
"google.co.th",
"google.com.tj",
"google.tl",
"google.tm",
"google.tn",
"google.to",
"google.com.tr",
"google.tt",
"google.com.tw",
"google.co.tz",
"google.com.ua",
"google.co.ug",
"google.co.uk",
"google.com.uy",
"google.co.uz",
"google.com.vc",
"google.co.ve",
"google.co.vi",
"google.com.vn",
"google.vu",
"google.ws",
"google.rs",
"google.co.za",
"google.co.zm",
"google.co.zw",
"google.cat",
]


def _convert_to_https(url: str, logger: Optional[logging.Logger] = None) -> str:
    uu = urlsplit(url)
    if uu.scheme == "http":
        if uu.netloc in CONVERT_HTTP:
            return urlunsplit(("https",) + uu[1:])
        if any(uu.netloc.endswith(suffix) for suffix in CONVERT_HTTP_SUFFIX):
            return urlunsplit(("https",) + uu[1:])
        if logger:
            logger.debug(
                "HTTP URL did not match allowlist: %s\nIf you think this should be auto-converted to HTTPS, make an issue here: https://github.com/seanbreckenridge/google_takeout_parser/issues/new",
                url,
            )
    # some other scheme, just return
    return url


def _convert_to_https_opt(
    url: Optional[str], logger: Optional[logging.Logger] = None
) -> Optional[str]:
    if url is None:
        return None
    return _convert_to_https(url, logger)


def convert_to_https(url: str) -> str:
    return _convert_to_https(url, logger=logger)


def convert_to_https_opt(url: Optional[str]) -> Optional[str]:
    return _convert_to_https_opt(url, logger=logger)
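
For reference, a quick usage sketch of the two public helpers added above. The example URLs are invented for illustration, and the outputs are written out by hand from the allowlist logic rather than captured from a real session:

>>> from google_takeout_parser.http_allowlist import convert_to_https, convert_to_https_opt
>>> convert_to_https("http://www.youtube.com/watch?v=abc")  # exact netloc match
'https://www.youtube.com/watch?v=abc'
>>> convert_to_https("http://maps.google.co.uk/maps")  # suffix match against the supported_domains list
'https://maps.google.co.uk/maps'
>>> convert_to_https("http://example.com/page")  # not allowlisted: returned unchanged, only a debug log
'http://example.com/page'
>>> convert_to_https("https://example.com/page")  # non-http schemes pass through untouched
'https://example.com/page'
>>> convert_to_https_opt(None) is None  # the Optional variant tolerates missing URLs
True
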
14 changes: 8 additions & 6 deletions google_takeout_parser/models.py
@@ -24,6 +24,8 @@
 from .common import Res
 from .log import logger
 
+Url = str
+
 
 def get_union_args(cls: Any) -> Optional[Tuple[Type]]:  # type: ignore[type-arg]
     if getattr(cls, "__origin__", None) != Union:
@@ -37,14 +39,14 @@ def get_union_args(cls: Any) -> Optional[Tuple[Type]]:  # type: ignore[type-arg]
 
 class Subtitles(NamedTuple):
     name: str
-    url: Optional[str]
+    url: Optional[Url]
 
 
 class LocationInfo(NamedTuple):
     name: Optional[str]
-    url: Optional[str]
+    url: Optional[Url]
     source: Optional[str]
-    sourceUrl: Optional[str]
+    sourceUrl: Optional[Url]
 
 
 class BaseEvent(Protocol):
@@ -59,7 +61,7 @@ class Activity(BaseEvent):
     title: str
     time: datetime
     description: Optional[str]
-    titleUrl: Optional[str]
+    titleUrl: Optional[Url]
     # note: in HTML exports, there is no way to tell the difference between
     # a description and a subtitle, so they end up as subtitles
     # more lines of text describing this
@@ -85,7 +87,7 @@ def key(self) -> Tuple[str, str, int]:
 class YoutubeComment(BaseEvent):
     content: str
     dt: datetime
-    urls: List[str]
+    urls: List[Url]
 
     @property
     def key(self) -> int:
@@ -198,7 +200,7 @@ def otherCandidateLocations(self) -> List[CandidateLocation]:
 @dataclass
 class ChromeHistory(BaseEvent):
     title: str
-    url: str
+    url: Url
     dt: datetime
 
     @property
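
A side note on the alias introduced above: Url = str is a plain type alias, not a typing.NewType, so any ordinary string still satisfies the new annotations and no call sites need casts; the alias only documents that a field is expected to hold a URL. A minimal sketch (the sample values are invented):

>>> from google_takeout_parser.models import Subtitles
>>> # a plain str is accepted anywhere Url is annotated
>>> Subtitles(name="some video", url="https://www.youtube.com/watch?v=abc").url
'https://www.youtube.com/watch?v=abc'
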
19 changes: 11 additions & 8 deletions google_takeout_parser/parse_html/activity.py
@@ -2,7 +2,6 @@
 Parses the HTML MyActivity.html files that used to be the standard
 """
 
-import warnings
 from pathlib import Path
 from datetime import datetime
 from typing import List, Iterator, Optional, Tuple, Union, Dict, Iterable
@@ -14,6 +13,7 @@
 from ..models import Activity, Subtitles, LocationInfo
 from ..common import Res
 from ..log import logger
+from ..http_allowlist import convert_to_https_opt
 from .html_time_utils import parse_html_dt
 
 
@@ -90,11 +90,13 @@ def _parse_subtitles(
                 if "href" in tag.attrs:
                     url = tag.attrs["href"]
                 else:
-                    warnings.warn(f"Unexpected tag! {tag}")
+                    logger.warning(f"Unexpected tag! {tag}")
             else:
                 raise RuntimeError(f"Unexpected Type {tag} {type(tag)}")
 
-        parsed_subs.append(Subtitles(name=clean_latin1_chars(buf), url=url))
+        parsed_subs.append(
+            Subtitles(name=clean_latin1_chars(buf), url=convert_to_https_opt(url))
+        )
 
     return parsed_subs, parse_html_dt(dt_raw, file_dt=file_dt)
 
@@ -239,16 +241,16 @@ def _parse_caption(
             locationInfos.append(
                 LocationInfo(
                     name=name,
-                    url=url,
+                    url=convert_to_https_opt(url),
                     source=source,
-                    sourceUrl=sourceUrl,
+                    sourceUrl=convert_to_https_opt(sourceUrl),
                 )
             )
         elif header == "Details:":
             details.append(str(clean_latin1_chars(str(value[0])).strip()))
 
         else:
-            warnings.warn(f"Unexpected header in caption {header} {value}")
+            logger.warning(f"Unexpected header in caption {header} {value}")
 
     return details, locationInfos, products
 
@@ -318,8 +320,9 @@ def _parse_activity_div(
 
     return Activity(
         header=header,
-        title=title_info[0],
-        titleUrl=title_info[1],  # could be None, matched by model
+        title=title_info.name,
+        # could be None, matched the JSON format
+        titleUrl=convert_to_https_opt(title_info.url),
         description=None,  # always none since we can't differentiate in HTML parsing
         time=dtime,
         locationInfos=locationInfos,
10 changes: 7 additions & 3 deletions google_takeout_parser/parse_html/comment.py
@@ -1,12 +1,13 @@
 import re
 from pathlib import Path
-from typing import Iterator
+from typing import Iterator, List
 from datetime import datetime, timezone
 
 import bs4
 
 from ..models import YoutubeComment
 from ..common import Res
+from ..http_allowlist import convert_to_https
 from .activity import _group_by_brs, clean_latin1_chars
 
 # seems to always be in UTC?
@@ -45,7 +46,10 @@ def _parse_html_li(li: bs4.element.Tag) -> YoutubeComment:
             desc += str(tag)
         elif isinstance(tag, bs4.element.Tag):
             desc += str(tag.text)
-    urls = list({link.attrs["href"] for link in li.select("a") if "href" in link.attrs})
+    urls: List[str] = []
+    for link in li.select("a"):
+        if "href" in link.attrs:
+            urls.append(convert_to_https(link.attrs["href"]))
     return YoutubeComment(
         content=clean_latin1_chars(desc).strip(), urls=urls, dt=parsed_date
     )
@@ -70,5 +74,5 @@ def test_parse_html_comment_file() -> None:
     assert parsed_li == YoutubeComment(
         content="content here",
         dt=datetime(2020, 4, 27, 23, 18, 23, tzinfo=timezone.utc),
-        urls=["http://www.youtube.com/watch?v=mM"],
+        urls=["https://www.youtube.com/watch?v=mM"],
     )
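
Worth noting in passing: besides the https normalization, the rewrite of the url collection in _parse_html_li above also switches from a set comprehension to a plain list, so duplicate hrefs are kept and document order is preserved. A standalone sketch of that difference, with made-up HTML (bs4 required):

>>> import bs4
>>> li = bs4.BeautifulSoup(
...     '<li><a href="http://www.youtube.com/a">x</a><a href="http://www.youtube.com/a">x</a></li>',
...     "html.parser",
... ).li
>>> sorted({a.attrs["href"] for a in li.select("a")})  # old behavior: set, deduplicated
['http://www.youtube.com/a']
>>> [a.attrs["href"] for a in li.select("a")]  # new behavior: list, order and duplicates kept
['http://www.youtube.com/a', 'http://www.youtube.com/a']
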
(Diffs for the remaining 3 changed files are not shown here.)