Add dockerfile for bistro om parser (#44)

scalableminds · Sep 18, 2023 · 3cf662c · 3cf662c
1 parent cd51e88
commit 3cf662c
Show file tree

Hide file tree

Showing 6 changed files with 193 additions and 0 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -19,6 +19,7 @@ jobs:
           - sbt
           - slurm-docker-cluster
           - wklink
+          - om-bistro
     runs-on: ubuntu-latest
 
     steps:

diff --git a/om-bistro/Dockerfile b/om-bistro/Dockerfile
@@ -0,0 +1,13 @@
+
+FROM python:3.8-slim-buster
+# All application files live in /python-docker inside the image.
+WORKDIR /python-docker
+# Install dependencies before copying the sources so this layer is reused when only code changes.
+COPY requirements.txt requirements.txt
+RUN pip3 install --no-cache-dir -r requirements.txt
+# Copy the application sources (the build context) into the working directory.
+COPY . .
+# Module that `flask run` loads; key=value form, the space-separated ENV form is legacy syntax.
+ENV FLASK_APP=openmensa_server
+# Bind to all interfaces so the server is reachable from outside the container.
+CMD [ "python3", "-m" , "flask", "run", "--host=0.0.0.0"]
diff --git a/om-bistro/openmensa_server.py b/om-bistro/openmensa_server.py
@@ -0,0 +1,46 @@
+import datetime
+
+from flask import Flask, make_response, Response
+
+from pdf_downloader import download_pdf
+from pdf_parser import parse_pdf
+
+app = Flask(__name__)
+
+last_download = datetime.date.fromtimestamp(0)  # epoch date, so the first /feed request always downloads
+
+
+@app.route("/feed")
+def feed():  # Serve the parsed menu feed; the PDF is downloaded at most once per calendar day per process.
+    global last_download
+    # limit downloads to once a day (last_download is always a date, so no None check is needed)
+    if last_download != datetime.date.today():
+        download_pdf()
+        last_download = datetime.date.today()  # only reached when the download did not raise
+    return Response(parse_pdf(), mimetype="text/xml")  # same XML mimetype as /meta instead of Flask's text/html default
+
+
+@app.route("/meta")
+def meta():  # Return the static OpenMensa metadata document for this canteen.
+    xml = """
+        <openmensa version="2.1"
+           xmlns="http://openmensa.org/open-mensa-v2"
+           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+           xsi:schemaLocation="http://openmensa.org/open-mensa-v2 http://openmensa.org/open-mensa-v2.xsd">
+    <canteen>
+    <name>Schraders Bistro</name>
+    <address>August-Bebel-Straße 26–53 14482 Potsdam</address>
+    <availability>public</availability>
+    <feed name="full">
+        <url>https://openmensa.scm.io/feed</url>
+        <schedule dayOfMonth="*" dayOfWeek="1" hour="8"/>
+    </feed>
+    </canteen>
+    </openmensa>
+    """
+    return Response(xml, mimetype="text/xml")  # the literal above is sent unchanged, including its leading whitespace
+
+
+@app.route("/health_check")
+def health_check():  # Liveness endpoint: always answers 200 "OK" without touching the PDF.
+    return make_response("OK", 200)
diff --git a/om-bistro/pdf_downloader.py b/om-bistro/pdf_downloader.py
@@ -0,0 +1,27 @@
+import datetime
+
+import requests
+
+# Example of a weekly menu PDF URL (the file name encodes the Monday-Friday date range):
+# https://ein-anderes-mahl.de/wp-content/uploads/2023/04/Speisen-Schraders-11.09.-15.09.2023.pdf
+
+
+def get_current_pdf():  # Build the URL of the menu PDF for the current Monday-Friday week.
+    current_week_monday = datetime.date.today() - datetime.timedelta(
+        days=datetime.date.today().weekday()
+    )
+    current_week_friday = current_week_monday + datetime.timedelta(days=4)
+    current_week_monday_string = current_week_monday.strftime("%d.%m.")  # Monday without year, e.g. "11.09."
+    current_week_friday_string = current_week_friday.strftime("%d.%m.%Y")  # Friday with year, e.g. "15.09.2023"
+    return f"https://ein-anderes-mahl.de/wp-content/uploads/2023/04/Speisen-Schraders-{current_week_monday_string}-{current_week_friday_string}.pdf"  # NOTE(review): upload folder "2023/04" is hard-coded; confirm it stays valid for later weeks
+
+
+def download_pdf():  # Fetch this week's menu PDF and store it as downloaded.pdf; raises requests.HTTPError on a 4xx/5xx reply.
+    r = requests.get(get_current_pdf(), timeout=30)  # timeout so a stalled server cannot hang the request forever
+    r.raise_for_status()  # never overwrite the cached PDF with an HTTP error page
+    with open("downloaded.pdf", "wb") as f:
+        f.write(r.content)
+
+
+if __name__ == "__main__":  # manual run: only fetch this week's PDF
+    download_pdf()
diff --git a/om-bistro/pdf_parser.py b/om-bistro/pdf_parser.py
@@ -0,0 +1,101 @@
+import datetime
+import re
+
+from dateutil.parser import parse
+from pdfquery import PDFQuery
+from pyopenmensa.feed import LazyBuilder
+
+days = ["Montag", "Dienstag", "Mittwoch", "Donnerstag", "Freitag"]  # German weekday labels (Mon-Fri) looked up in the PDF, in order
+
+
+
+def get_text_until_next_day(pdf, day_label):  # Return child texts of the siblings after the day_label line, up to the next marker.
+    query = f"LTTextLineHorizontal:contains('{day_label}')"
+    day_element = pdf.pq(query)[0]  # first matching line; fails if the label is not in the PDF
+
+    def is_date_demarker(element):
+        # True when the element's first child mentions a weekday or the footer text.
+        children = element.getchildren()
+        # An element without children or without text can never be a marker,
+        # so treat it as ordinary content instead of crashing on it.
+        text = (children[0].text if children else None) or ""
+        # "wechselnden Speisen" is a hacky delimiter for the last day
+        markers = [*days, "wechselnden Speisen"]
+        return any(marker in text for marker in markers)
+
+    current_element = day_element.getnext()
+    strings = []
+    while current_element is not None and not is_date_demarker(current_element):  # also stop when siblings run out
+        for child in current_element.getchildren():
+            strings.append(child.text)
+        current_element = current_element.getnext()
+    return strings
+
+
+def filter_texts(texts):  # Yield stripped, non-empty texts, dropping entries that consist only of a price.
+    price_regex = r"\d+,\d\d?"  # e.g. "4,5", "6,90", "12,50"; fullmatch already anchors both ends
+    for text in texts:
+        text = (text or "").strip()  # an element's text may be None; treat it as empty
+        if text == "":
+            continue
+        if re.fullmatch(price_regex, text):
+            continue
+        yield text
+
+
+def parse_texts_into_meals(texts):  # Group the filtered text lines into one string per meal.
+    match = ""  # meal currently being assembled; "" means none has started yet
+    for text in filter_texts(texts):
+        if text.startswith("- "):  # a leading "- " opens a new meal
+            if match != "":
+                yield match
+                match = ""
+            match += text[2:]
+            continue
+        elif match != "":  # continuation line; lines before the first "- " are ignored
+            match += " " + text
+    if match != "":
+        yield match  # flush the last meal
+
+
+def get_meals_for_day(pdf, day_label):  # Return the list of meal strings found under day_label in the PDF.
+    texts = get_text_until_next_day(pdf, day_label)
+    return list(parse_texts_into_meals(texts))
+
+
+def find_monday_date(pdf) -> datetime.date:  # Return the first date of the "Mittagstisch vom" line; raises ValueError if it cannot be read.
+    try:
+        date_string = (
+            pdf.pq("LTTextLineHorizontal:contains('Mittagstisch vom')")[0]
+            .getchildren()[0]
+            .text
+        )
+        daterx = r"(\d\d)\.(\d\d)"  # day.month of the first date in the line
+        day, month = map(int, re.search(daterx, date_string).groups())  # explicit day-first parsing, no guessing
+        return datetime.date(datetime.date.today().year, month, day)  # only day and month are captured; assume the current year
+    except Exception as err:
+        raise ValueError("Could not parse date") from err  # fail loudly instead of returning None to the caller
+
+
+def parse_pdf():  # Build the feed from downloaded.pdf and return the result of canteen.toXMLFeed().
+    pdf_path = "downloaded.pdf"
+    pdf = PDFQuery(pdf_path)
+    pdf.load()  # NOTE(review): the PDF is re-loaded on every call and never explicitly closed; confirm PDFQuery releases the file
+
+    canteen = LazyBuilder()
+    canteen.name = "Schraders Bistro"
+    canteen.city = "Potsdam"
+
+    current_day = find_monday_date(pdf)  # date used for the first entry of `days`
+
+    for day in days:
+        for i, food in enumerate(get_meals_for_day(pdf, day)):
+            category = "Fleischlos" if i == 0 else "Fleischlich" # Until now, vegetarian food is always first
+            canteen.addMeal(current_day, category, food)
+        current_day = current_day + datetime.timedelta(days=1)  # advance one calendar day per weekday label
+
+    return canteen.toXMLFeed()
+
+
+if __name__ == "__main__":  # manual run: print the feed built from an existing downloaded.pdf
+    print(parse_pdf())
diff --git a/om-bistro/requirements.txt b/om-bistro/requirements.txt
@@ -0,0 +1,5 @@
+pdfquery
+pyopenmensa
+python-dateutil
+requests
+flask