Commit

wip commit
this does not work yet, the lxml doesn't detect the outer-cell divs for some reason
purarue committed Mar 20, 2023
1 parent 770abed commit 09307da
Showing 2 changed files with 72 additions and 3 deletions.
24 changes: 24 additions & 0 deletions benchmark/html_parsing.py
@@ -0,0 +1,24 @@
"""
basic script to parse HTML so its easier to benchmark in isolation
"""

import sys
from pathlib import Path
from google_takeout_parser.parse_html.activity import _parse_html_activity


def main() -> None:
file = Path(sys.argv[1])
# print file size
print(f"File size: {file.stat().st_size}")
# just consume the whole generator
count = 0
for event in _parse_html_activity(Path(sys.argv[1]), stream=True):
assert not isinstance(event, Exception)
print(event)
count += 1
print(f"Number of items: {count}")


if __name__ == "__main__":
main()
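For reference, the benchmark script takes a single HTML activity export as its only argument and streams through it; the path below is just a placeholder, not a file from the repository:

    python benchmark/html_parsing.py /path/to/MyActivity.html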
51 changes: 48 additions & 3 deletions google_takeout_parser/parse_html/activity.py
@@ -10,6 +10,7 @@

import bs4
from bs4.element import Tag, PageElement
from lxml import etree # type: ignore[import]

from ..models import Activity, Subtitles, Details, LocationInfo
from ..common import Res
@@ -329,14 +330,58 @@ def _parse_activity_div(
)


def _parse_html_activity(p: Path) -> Iterator[Res[Activity]]:
    file_dt = datetime.fromtimestamp(p.stat().st_mtime)
    soup = bs4.BeautifulSoup(p.read_text(), "lxml")
def _file_read_chunks(p: Path, chunk_size: int = 1024) -> Iterator[str]:
    with p.open("r") as fo:
        while True:
            data = fo.read(chunk_size)
            if not data:
                break
            yield data


def _iter_activity_divs(p: Path) -> Iterator[str]:
    """
    memory efficient way to iterate over the .outer-cell divs in the HTML file
    """
    parser = etree.XMLPullParser(recover=True)
    events = parser.read_events()

    for chunk in _file_read_chunks(p):
        parser.feed(chunk)
        for event, elem in events:
            print(event, elem.tag, elem.attrib)
            if (
                event == "end"
                and elem.tag == "div"
                and "outer-cell" in elem.attrib.get("class", "")
            ):
                data = etree.tostring(elem, encoding="utf-8").decode("utf-8")
                yield data
            # elem.clear(keep_tail=True)


def _parse_activity_from_soup(
    soup: bs4.BeautifulSoup, file_dt: datetime
) -> Iterator[Res[Activity]]:
    for outer_div in soup.select("div.outer-cell"):
        try:
            yield _parse_activity_div(outer_div, file_dt=file_dt)
        except Exception as ae:
            yield ae


def _parse_html_activity(p: Path, stream: bool = False) -> Iterator[Res[Activity]]:
    file_dt = datetime.fromtimestamp(p.stat().st_mtime)
    if stream:
        for activity_div in _iter_activity_divs(p):
            # TODO: ideally we wouldn't have to create another soup here, but
            # just using the div extracted from lxml leads to lots of errors
            soup = bs4.BeautifulSoup(activity_div, "lxml")
            yield from _parse_activity_from_soup(soup, file_dt=file_dt)
    else:
        with p.open("r") as fo:
            soup = bs4.BeautifulSoup(fo, "lxml")
            yield from _parse_activity_from_soup(soup, file_dt=file_dt)


_parse_html_activity.return_type = Activity # type: ignore[attr-defined]
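A note on the issue mentioned in the commit message: _iter_activity_divs feeds HTML into etree.XMLPullParser(recover=True), which expects well-formed XML, so the outer-cell divs may never surface as clean "end" events. One possible direction (an assumption, not something this commit does) would be lxml's etree.HTMLPullParser, which exposes the same feed()/read_events() interface but uses the HTML parser. A minimal sketch with a hypothetical helper name, untested against real Takeout exports:

    from pathlib import Path
    from typing import Iterator

    from lxml import etree


    def _iter_outer_cells(p: Path, chunk_size: int = 1024) -> Iterator[str]:
        # sketch only: HTMLPullParser tolerates HTML that is not well-formed XML
        parser = etree.HTMLPullParser(events=("end",))
        with p.open("r") as fo:
            while True:
                chunk = fo.read(chunk_size)
                if not chunk:
                    break
                parser.feed(chunk)
                for _event, elem in parser.read_events():
                    if elem.tag == "div" and "outer-cell" in elem.get("class", ""):
                        yield etree.tostring(elem, encoding="unicode")
                        # free the subtree once it has been yielded
                        elem.clear()

Whether this actually resolves the detection problem is not verified here.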
