Commit

wip commit
this does not work yet, the lxml doesn't detect the outer-cell divs for some reason
purarue committed Mar 20, 2023
1 parent 770abed commit 09307da
Showing 2 changed files with 72 additions and 3 deletions.
24 changes: 24 additions & 0 deletions benchmark/html_parsing.py
@@ -0,0 +1,24 @@
"""
basic script to parse HTML so its easier to benchmark in isolation
"""

import sys
from pathlib import Path
from google_takeout_parser.parse_html.activity import _parse_html_activity


def main() -> None:
file = Path(sys.argv[1])
# print file size
print(f"File size: {file.stat().st_size}")
# just consume the whole generator
count = 0
for event in _parse_html_activity(Path(sys.argv[1]), stream=True):
assert not isinstance(event, Exception)
print(event)
count += 1
print(f"Number of items: {count}")


if __name__ == "__main__":
main()
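For reference, the benchmark script takes a single HTML activity export as its only argument and streams through it; the path below is just a placeholder, not a file from the repository:

    python benchmark/html_parsing.py /path/to/MyActivity.html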
51 changes: 48 additions & 3 deletions google_takeout_parser/parse_html/activity.py
@@ -10,6 +10,7 @@

import bs4
from bs4.element import Tag, PageElement
from lxml import etree # type: ignore[import]

from ..models import Activity, Subtitles, Details, LocationInfo
from ..common import Res
@@ -329,14 +330,58 @@ def _parse_activity_div(
)


def _parse_html_activity(p: Path) -> Iterator[Res[Activity]]:
    file_dt = datetime.fromtimestamp(p.stat().st_mtime)
    soup = bs4.BeautifulSoup(p.read_text(), "lxml")
def _file_read_chunks(p: Path, chunk_size: int = 1024) -> Iterator[str]:
    with p.open("r") as fo:
        while True:
            data = fo.read(chunk_size)
            if not data:
                break
            yield data


def _iter_activity_divs(p: Path) -> Iterator[str]:
    """
    memory efficient way to iterate over the .outer-cell divs in the HTML file
    """
    parser = etree.XMLPullParser(recover=True)
    events = parser.read_events()

    for chunk in _file_read_chunks(p):
        parser.feed(chunk)
        for event, elem in events:
            print(event, elem.tag, elem.attrib)
            if (
                event == "end"
                and elem.tag == "div"
                and "outer-cell" in elem.attrib.get("class", "")
            ):
                data = etree.tostring(elem, encoding="utf-8").decode("utf-8")
                yield data
            # elem.clear(keep_tail=True)


def _parse_activity_from_soup(
    soup: bs4.BeautifulSoup, file_dt: datetime
) -> Iterator[Res[Activity]]:
    for outer_div in soup.select("div.outer-cell"):
        try:
            yield _parse_activity_div(outer_div, file_dt=file_dt)
        except Exception as ae:
            yield ae


def _parse_html_activity(p: Path, stream: bool = False) -> Iterator[Res[Activity]]:
    file_dt = datetime.fromtimestamp(p.stat().st_mtime)
    if stream:
        for activity_div in _iter_activity_divs(p):
            # TODO: ideally we wouldn't have to create another soup here, but
            # just using the div extracted from lxml leads to lots of errors
            soup = bs4.BeautifulSoup(activity_div, "lxml")
            yield from _parse_activity_from_soup(soup, file_dt=file_dt)
    else:
        with p.open("r") as fo:
            soup = bs4.BeautifulSoup(fo, "lxml")
            yield from _parse_activity_from_soup(soup, file_dt=file_dt)


_parse_html_activity.return_type = Activity # type: ignore[attr-defined]
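A note on the issue mentioned in the commit message: _iter_activity_divs feeds HTML into etree.XMLPullParser(recover=True), which expects well-formed XML, so the outer-cell divs may never surface as clean "end" events. One possible direction (an assumption, not something this commit does) would be lxml's etree.HTMLPullParser, which exposes the same feed()/read_events() interface but uses the HTML parser. A minimal sketch with a hypothetical helper name, untested against real Takeout exports:

    from pathlib import Path
    from typing import Iterator

    from lxml import etree


    def _iter_outer_cells(p: Path, chunk_size: int = 1024) -> Iterator[str]:
        # sketch only: HTMLPullParser tolerates HTML that is not well-formed XML
        parser = etree.HTMLPullParser(events=("end",))
        with p.open("r") as fo:
            while True:
                chunk = fo.read(chunk_size)
                if not chunk:
                    break
                parser.feed(chunk)
                for _event, elem in parser.read_events():
                    if elem.tag == "div" and "outer-cell" in elem.get("class", ""):
                        yield etree.tostring(elem, encoding="unicode")
                        # free the subtree once it has been yielded
                        elem.clear()

Whether this actually resolves the detection problem is not verified here.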
