allowlist for http -> https, resolves #31 (#48)
* allowlist for http -> https, resolves #31

* remove . prefix from google urls

* convert suffix list to list, no reason for set

* remove extra https check

* mention converted to what
purarue authored Oct 1, 2023
1 parent 602ef23 commit 6df75ee
Showing 7 changed files with 310 additions and 24 deletions.
238 changes: 238 additions & 0 deletions google_takeout_parser/http_allowlist.py
@@ -0,0 +1,238 @@
import logging
from typing import Set, Optional, List

from .log import logger

from urllib.parse import urlsplit, urlunsplit

# exact matches
CONVERT_HTTP: Set[str] = {
"m.youtube.com",
"www.youtube.com",
"youtube.com",
"bp0.blogger.com",
}

# anything that ends with these domains
# curl -sL 'https://www.google.com/supported_domains'
CONVERT_HTTP_SUFFIX: List[str] = [
"google.com",
"google.ad",
"google.ae",
"google.com.af",
"google.com.ag",
"google.al",
"google.am",
"google.co.ao",
"google.com.ar",
"google.as",
"google.at",
"google.com.au",
"google.az",
"google.ba",
"google.com.bd",
"google.be",
"google.bf",
"google.bg",
"google.com.bh",
"google.bi",
"google.bj",
"google.com.bn",
"google.com.bo",
"google.com.br",
"google.bs",
"google.bt",
"google.co.bw",
"google.by",
"google.com.bz",
"google.ca",
"google.cd",
"google.cf",
"google.cg",
"google.ch",
"google.ci",
"google.co.ck",
"google.cl",
"google.cm",
"google.cn",
"google.com.co",
"google.co.cr",
"google.com.cu",
"google.cv",
"google.com.cy",
"google.cz",
"google.de",
"google.dj",
"google.dk",
"google.dm",
"google.com.do",
"google.dz",
"google.com.ec",
"google.ee",
"google.com.eg",
"google.es",
"google.com.et",
"google.fi",
"google.com.fj",
"google.fm",
"google.fr",
"google.ga",
"google.ge",
"google.gg",
"google.com.gh",
"google.com.gi",
"google.gl",
"google.gm",
"google.gr",
"google.com.gt",
"google.gy",
"google.com.hk",
"google.hn",
"google.hr",
"google.ht",
"google.hu",
"google.co.id",
"google.ie",
"google.co.il",
"google.im",
"google.co.in",
"google.iq",
"google.is",
"google.it",
"google.je",
"google.com.jm",
"google.jo",
"google.co.jp",
"google.co.ke",
"google.com.kh",
"google.ki",
"google.kg",
"google.co.kr",
"google.com.kw",
"google.kz",
"google.la",
"google.com.lb",
"google.li",
"google.lk",
"google.co.ls",
"google.lt",
"google.lu",
"google.lv",
"google.com.ly",
"google.co.ma",
"google.md",
"google.me",
"google.mg",
"google.mk",
"google.ml",
"google.com.mm",
"google.mn",
"google.com.mt",
"google.mu",
"google.mv",
"google.mw",
"google.com.mx",
"google.com.my",
"google.co.mz",
"google.com.na",
"google.com.ng",
"google.com.ni",
"google.ne",
"google.nl",
"google.no",
"google.com.np",
"google.nr",
"google.nu",
"google.co.nz",
"google.com.om",
"google.com.pa",
"google.com.pe",
"google.com.pg",
"google.com.ph",
"google.com.pk",
"google.pl",
"google.pn",
"google.com.pr",
"google.ps",
"google.pt",
"google.com.py",
"google.com.qa",
"google.ro",
"google.ru",
"google.rw",
"google.com.sa",
"google.com.sb",
"google.sc",
"google.se",
"google.com.sg",
"google.sh",
"google.si",
"google.sk",
"google.com.sl",
"google.sn",
"google.so",
"google.sm",
"google.sr",
"google.st",
"google.com.sv",
"google.td",
"google.tg",
"google.co.th",
"google.com.tj",
"google.tl",
"google.tm",
"google.tn",
"google.to",
"google.com.tr",
"google.tt",
"google.com.tw",
"google.co.tz",
"google.com.ua",
"google.co.ug",
"google.co.uk",
"google.com.uy",
"google.co.uz",
"google.com.vc",
"google.co.ve",
"google.co.vi",
"google.com.vn",
"google.vu",
"google.ws",
"google.rs",
"google.co.za",
"google.co.zm",
"google.co.zw",
"google.cat",
]


def _convert_to_https(url: str, logger: Optional[logging.Logger] = None) -> str:
    uu = urlsplit(url)
    if uu.scheme == "http":
        if uu.netloc in CONVERT_HTTP:
            return urlunsplit(("https",) + uu[1:])
        if any(uu.netloc.endswith(suffix) for suffix in CONVERT_HTTP_SUFFIX):
            return urlunsplit(("https",) + uu[1:])
        if logger:
            logger.debug(
                "HTTP URL did not match allowlist: %s\nIf you think this should be auto-converted to HTTPS, make an issue here: https://github.com/seanbreckenridge/google_takeout_parser/issues/new",
                url,
            )
    # some other scheme, just return
    return url


def _convert_to_https_opt(
    url: Optional[str], logger: Optional[logging.Logger] = None
) -> Optional[str]:
    if url is None:
        return None
    return _convert_to_https(url, logger)


def convert_to_https(url: str) -> str:
    return _convert_to_https(url, logger=logger)


def convert_to_https_opt(url: Optional[str]) -> Optional[str]:
    return _convert_to_https_opt(url, logger=logger)
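
For reference, a quick usage sketch of the two public helpers added above. The example URLs are invented for illustration, and the outputs are written out by hand from the allowlist logic rather than captured from a real session:

>>> from google_takeout_parser.http_allowlist import convert_to_https, convert_to_https_opt
>>> convert_to_https("http://www.youtube.com/watch?v=abc")  # exact netloc match
'https://www.youtube.com/watch?v=abc'
>>> convert_to_https("http://maps.google.co.uk/maps")  # suffix match against the supported_domains list
'https://maps.google.co.uk/maps'
>>> convert_to_https("http://example.com/page")  # not allowlisted: returned unchanged, only a debug log
'http://example.com/page'
>>> convert_to_https("https://example.com/page")  # non-http schemes pass through untouched
'https://example.com/page'
>>> convert_to_https_opt(None) is None  # the Optional variant tolerates missing URLs
True
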
14 changes: 8 additions & 6 deletions google_takeout_parser/models.py
@@ -24,6 +24,8 @@
 from .common import Res
 from .log import logger
 
+Url = str
+
 
 def get_union_args(cls: Any) -> Optional[Tuple[Type]]:  # type: ignore[type-arg]
     if getattr(cls, "__origin__", None) != Union:
@@ -37,14 +39,14 @@ def get_union_args(cls: Any) -> Optional[Tuple[Type]]:  # type: ignore[type-arg]
 
 class Subtitles(NamedTuple):
     name: str
-    url: Optional[str]
+    url: Optional[Url]
 
 
 class LocationInfo(NamedTuple):
     name: Optional[str]
-    url: Optional[str]
+    url: Optional[Url]
     source: Optional[str]
-    sourceUrl: Optional[str]
+    sourceUrl: Optional[Url]
 
 
 class BaseEvent(Protocol):
@@ -59,7 +61,7 @@ class Activity(BaseEvent):
     title: str
     time: datetime
     description: Optional[str]
-    titleUrl: Optional[str]
+    titleUrl: Optional[Url]
     # note: in HTML exports, there is no way to tell the difference between
     # a description and a subtitle, so they end up as subtitles
     # more lines of text describing this
@@ -85,7 +87,7 @@ def key(self) -> Tuple[str, str, int]:
 class YoutubeComment(BaseEvent):
     content: str
     dt: datetime
-    urls: List[str]
+    urls: List[Url]
 
     @property
     def key(self) -> int:
@@ -198,7 +200,7 @@ def otherCandidateLocations(self) -> List[CandidateLocation]:
 @dataclass
 class ChromeHistory(BaseEvent):
     title: str
-    url: str
+    url: Url
     dt: datetime
 
     @property
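
A side note on the alias introduced above: Url = str is a plain type alias, not a typing.NewType, so any ordinary string still satisfies the new annotations and no call sites need casts; the alias only documents that a field is expected to hold a URL. A minimal sketch (the sample values are invented):

>>> from google_takeout_parser.models import Subtitles
>>> # a plain str is accepted anywhere Url is annotated
>>> Subtitles(name="some video", url="https://www.youtube.com/watch?v=abc").url
'https://www.youtube.com/watch?v=abc'
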
19 changes: 11 additions & 8 deletions google_takeout_parser/parse_html/activity.py
@@ -2,7 +2,6 @@
 Parses the HTML MyActivity.html files that used to be the standard
 """
 
-import warnings
 from pathlib import Path
 from datetime import datetime
 from typing import List, Iterator, Optional, Tuple, Union, Dict, Iterable
@@ -14,6 +13,7 @@
 from ..models import Activity, Subtitles, LocationInfo
 from ..common import Res
 from ..log import logger
+from ..http_allowlist import convert_to_https_opt
 from .html_time_utils import parse_html_dt
 
 
@@ -90,11 +90,13 @@ def _parse_subtitles(
                 if "href" in tag.attrs:
                     url = tag.attrs["href"]
                 else:
-                    warnings.warn(f"Unexpected tag! {tag}")
+                    logger.warning(f"Unexpected tag! {tag}")
             else:
                 raise RuntimeError(f"Unexpected Type {tag} {type(tag)}")
 
-        parsed_subs.append(Subtitles(name=clean_latin1_chars(buf), url=url))
+        parsed_subs.append(
+            Subtitles(name=clean_latin1_chars(buf), url=convert_to_https_opt(url))
+        )
 
     return parsed_subs, parse_html_dt(dt_raw, file_dt=file_dt)
 
@@ -239,16 +241,16 @@ def _parse_caption(
             locationInfos.append(
                 LocationInfo(
                     name=name,
-                    url=url,
+                    url=convert_to_https_opt(url),
                     source=source,
-                    sourceUrl=sourceUrl,
+                    sourceUrl=convert_to_https_opt(sourceUrl),
                 )
             )
         elif header == "Details:":
             details.append(str(clean_latin1_chars(str(value[0])).strip()))
 
         else:
-            warnings.warn(f"Unexpected header in caption {header} {value}")
+            logger.warning(f"Unexpected header in caption {header} {value}")
 
     return details, locationInfos, products
 
@@ -318,8 +320,9 @@ def _parse_activity_div(
 
     return Activity(
         header=header,
-        title=title_info[0],
-        titleUrl=title_info[1],  # could be None, matched by model
+        title=title_info.name,
+        # could be None, matched the JSON format
+        titleUrl=convert_to_https_opt(title_info.url),
         description=None,  # always none since we can't differentiate in HTML parsing
         time=dtime,
         locationInfos=locationInfos,
10 changes: 7 additions & 3 deletions google_takeout_parser/parse_html/comment.py
@@ -1,12 +1,13 @@
 import re
 from pathlib import Path
-from typing import Iterator
+from typing import Iterator, List
 from datetime import datetime, timezone
 
 import bs4
 
 from ..models import YoutubeComment
 from ..common import Res
+from ..http_allowlist import convert_to_https
 from .activity import _group_by_brs, clean_latin1_chars
 
 # seems to always be in UTC?
@@ -45,7 +46,10 @@ def _parse_html_li(li: bs4.element.Tag) -> YoutubeComment:
             desc += str(tag)
         elif isinstance(tag, bs4.element.Tag):
             desc += str(tag.text)
-    urls = list({link.attrs["href"] for link in li.select("a") if "href" in link.attrs})
+    urls: List[str] = []
+    for link in li.select("a"):
+        if "href" in link.attrs:
+            urls.append(convert_to_https(link.attrs["href"]))
     return YoutubeComment(
         content=clean_latin1_chars(desc).strip(), urls=urls, dt=parsed_date
     )
@@ -70,5 +74,5 @@ def test_parse_html_comment_file() -> None:
     assert parsed_li == YoutubeComment(
         content="content here",
         dt=datetime(2020, 4, 27, 23, 18, 23, tzinfo=timezone.utc),
-        urls=["http://www.youtube.com/watch?v=mM"],
+        urls=["https://www.youtube.com/watch?v=mM"],
     )
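
Worth noting in passing: besides the https normalization, the rewrite of the url collection in _parse_html_li above also switches from a set comprehension to a plain list, so duplicate hrefs are kept and document order is preserved. A standalone sketch of that difference, with made-up HTML (bs4 required):

>>> import bs4
>>> li = bs4.BeautifulSoup(
...     '<li><a href="http://www.youtube.com/a">x</a><a href="http://www.youtube.com/a">x</a></li>',
...     "html.parser",
... ).li
>>> sorted({a.attrs["href"] for a in li.select("a")})  # old behavior: set, deduplicated
['http://www.youtube.com/a']
>>> [a.attrs["href"] for a in li.select("a")]  # new behavior: list, order and duplicates kept
['http://www.youtube.com/a', 'http://www.youtube.com/a']
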
(Diffs for the remaining 3 changed files are not shown here.)