From 3cf662c25ffb7c122c76ff0ad6faed8395549df9 Mon Sep 17 00:00:00 2001
From: frcroth
Date: Mon, 18 Sep 2023 16:48:35 +0200
Subject: [PATCH] Add dockerfile for bistro om parser (#44)

---
Review notes (text in this section is ignored when the patch is applied):
- The copy of this patch under review had its line breaks and indentation
  collapsed, and the author e-mail on the From: line as well as the XML
  element tags inside meta() stripped. Indentation was restored from the
  syntax, and the metadata markup was rebuilt following the OpenMensa
  feed v2 schema. Verify both (and the YAML context lines) before applying.
- The blob ids on the "index" lines predate the fixes made in this revision.

 .github/workflows/main.yml    |   1 +
 om-bistro/Dockerfile          |  15 +++++
 om-bistro/openmensa_server.py |  54 ++++++++++++++++
 om-bistro/pdf_downloader.py   |  39 ++++++++++++
 om-bistro/pdf_parser.py       | 126 ++++++++++++++++++++++++++++++++++++
 om-bistro/requirements.txt    |   5 ++
 6 files changed, 240 insertions(+)
 create mode 100644 om-bistro/Dockerfile
 create mode 100644 om-bistro/openmensa_server.py
 create mode 100644 om-bistro/pdf_downloader.py
 create mode 100644 om-bistro/pdf_parser.py
 create mode 100644 om-bistro/requirements.txt

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 9c6c447..db9544e 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -19,6 +19,7 @@ jobs:
           - sbt
           - slurm-docker-cluster
           - wklink
+          - om-bistro
     runs-on: ubuntu-latest
 
     steps:
diff --git a/om-bistro/Dockerfile b/om-bistro/Dockerfile
new file mode 100644
index 0000000..5fecd3c
--- /dev/null
+++ b/om-bistro/Dockerfile
@@ -0,0 +1,15 @@
+
+FROM python:3.8-slim-buster
+
+WORKDIR /python-docker
+
+# Install dependencies before copying the sources so this layer stays cached
+# when only the application code changes.
+COPY requirements.txt requirements.txt
+RUN pip3 install -r requirements.txt
+
+COPY . .
+
+ENV FLASK_APP=openmensa_server
+
+CMD [ "python3", "-m" , "flask", "run", "--host=0.0.0.0"]
diff --git a/om-bistro/openmensa_server.py b/om-bistro/openmensa_server.py
new file mode 100644
index 0000000..5627ccf
--- /dev/null
+++ b/om-bistro/openmensa_server.py
@@ -0,0 +1,54 @@
+"""Flask app serving the Schraders Bistro menu as an OpenMensa feed."""
+import datetime
+
+from flask import Flask, make_response, Response
+
+from pdf_downloader import download_pdf
+from pdf_parser import parse_pdf
+
+app = Flask(__name__)
+
+# Day of the last successful PDF download; the epoch date forces a download on
+# the first request.
+last_download = datetime.date.fromtimestamp(0)
+
+
+@app.route("/feed")
+def feed():
+    """Return the menu feed as XML, downloading the PDF at most once a day."""
+    global last_download
+    today = datetime.date.today()
+    # limit downloads to once a day
+    if last_download != today:
+        download_pdf()
+        last_download = today
+    return Response(parse_pdf(), mimetype="text/xml")
+
+
+@app.route("/meta")
+def meta():
+    """Return the OpenMensa metadata document describing the canteen."""
+    # The XML declaration has to be the very first thing in the document, so it
+    # follows the opening quotes without a line break.
+    xml = """<?xml version="1.0" encoding="UTF-8"?>
+<openmensa version="2.1"
+           xmlns="http://openmensa.org/open-mensa-v2"
+           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+           xsi:schemaLocation="http://openmensa.org/open-mensa-v2 http://openmensa.org/open-mensa-v2.xsd">
+  <canteen>
+    <name>Schraders Bistro</name>
+    <address>August-Bebel-Straße 26–53 14482 Potsdam</address>
+    <availability>public</availability>
+    <feed name="full">
+      <url>https://openmensa.scm.io/feed</url>
+    </feed>
+  </canteen>
+</openmensa>
+"""
+    return Response(xml, mimetype="text/xml")
+
+
+@app.route("/health_check")
+def health_check():
+    """Liveness probe; always answers ``200 OK``."""
+    return make_response("OK", 200)
diff --git a/om-bistro/pdf_downloader.py b/om-bistro/pdf_downloader.py
new file mode 100644
index 0000000..ef282b2
--- /dev/null
+++ b/om-bistro/pdf_downloader.py
@@ -0,0 +1,39 @@
+"""Download the weekly menu PDF of Schraders Bistro."""
+import datetime
+
+import requests
+
+# Reference:
+# https://ein-anderes-mahl.de/wp-content/uploads/2023/04/Speisen-Schraders-11.09.-15.09.2023.pdf
+
+
+def get_current_pdf():
+    """Return the URL of the menu PDF for the current Monday-to-Friday week."""
+    # Read the clock once so Monday is computed from a single, consistent date.
+    today = datetime.date.today()
+    current_week_monday = today - datetime.timedelta(days=today.weekday())
+    current_week_friday = current_week_monday + datetime.timedelta(days=4)
+    current_week_monday_string = current_week_monday.strftime("%d.%m.")
+    current_week_friday_string = current_week_friday.strftime("%d.%m.%Y")
+    # NOTE(review): the "2023/04" upload folder is copied from the reference
+    # URL above; confirm it stays valid for later uploads.
+    return f"https://ein-anderes-mahl.de/wp-content/uploads/2023/04/Speisen-Schraders-{current_week_monday_string}-{current_week_friday_string}.pdf"
+
+
+def download_pdf():
+    """Download the current menu PDF to ``downloaded.pdf``.
+
+    Raises:
+        requests.RequestException: on network errors, timeouts or a 4xx/5xx
+            response.
+    """
+    pdf_url = get_current_pdf()
+    r = requests.get(pdf_url, timeout=30)
+    # Fail loudly instead of saving an HTML error page as the PDF.
+    r.raise_for_status()
+    with open("downloaded.pdf", "wb") as f:
+        f.write(r.content)
+
+
+if __name__ == "__main__":
+    download_pdf()
diff --git a/om-bistro/pdf_parser.py b/om-bistro/pdf_parser.py
new file mode 100644
index 0000000..b38d451
--- /dev/null
+++ b/om-bistro/pdf_parser.py
@@ -0,0 +1,126 @@
+"""Turn the downloaded Schraders Bistro menu PDF into an OpenMensa feed."""
+import datetime
+import re
+
+from dateutil.parser import parse
+from pdfquery import PDFQuery
+from pyopenmensa.feed import LazyBuilder
+
+days = ["Montag", "Dienstag", "Mittwoch", "Donnerstag", "Freitag"]
+
+
+def get_text_until_next_day(pdf, day_label):
+    """Collect the texts between the ``day_label`` heading and the next one.
+
+    Returns an empty list when the PDF contains no line with ``day_label``.
+    """
+    query = f"LTTextLineHorizontal:contains('{day_label}')"
+    matches = pdf.pq(query)
+    if len(matches) == 0:
+        return []
+    day_element = matches[0]
+
+    def is_date_demarker(element):
+        children = element.getchildren()
+        text = children[0].text if len(children) > 0 else None
+        if not text:
+            return False
+        # "wechselnden Speisen" is a hacky delimiter for the last day
+        return any(marker in text for marker in [*days, "wechselnden Speisen"])
+
+    current_element = day_element.getnext()
+    strings = []
+    # Also stop when the siblings run out, so a PDF without the trailing
+    # delimiter cannot fail on ``None``.
+    while current_element is not None and not is_date_demarker(current_element):
+        for child in current_element.getchildren():
+            strings.append(child.text)
+        current_element = current_element.getnext()
+    return strings
+
+
+def filter_texts(texts):
+    """Yield the stripped texts, skipping empty entries and bare prices."""
+    # One or more euro digits, a comma and one or two cent digits, e.g. "4,50".
+    price_regex = r"^\d+,\d\d?$"
+    for text in texts:
+        # Elements without text yield None; treat them like empty strings.
+        text = (text or "").strip()
+        if text == "":
+            continue
+        if re.fullmatch(price_regex, text):
+            continue
+        yield text
+
+
+def parse_texts_into_meals(texts):
+    """Join text lines into meal names.
+
+    A line starting with "- " opens a new meal; following lines are appended
+    to it with a space. Lines before the first "- " line are dropped.
+    """
+    meal = ""
+    for text in filter_texts(texts):
+        if text.startswith("- "):
+            if meal != "":
+                yield meal
+            meal = text[2:]
+        elif meal != "":
+            meal += " " + text
+    if meal != "":
+        yield meal
+
+
+def get_meals_for_day(pdf, day_label):
+    """Return the list of meal names found under ``day_label``."""
+    texts = get_text_until_next_day(pdf, day_label)
+    return list(parse_texts_into_meals(texts))
+
+
+def find_monday_date(pdf) -> datetime.date:
+    """Return the first ``dd.mm`` date of the "Mittagstisch vom" heading.
+
+    The current year is assumed, as the matched ``dd.mm`` carries none.
+
+    Raises:
+        ValueError: if the heading or its date cannot be found or parsed.
+    """
+    try:
+        date_string = (
+            pdf.pq("LTTextLineHorizontal:contains('Mittagstisch vom')")[0]
+            .getchildren()[0]
+            .text
+        )
+        daterx = r"\d\d\.\d\d"
+        first_date = re.findall(daterx, date_string)[0]
+        # Parse a complete, day-first date; a bare "dd.mm" is ambiguous.
+        year = datetime.date.today().year
+        return parse(f"{first_date}.{year}", dayfirst=True).date()
+    except Exception as err:
+        raise ValueError("Could not parse date") from err
+
+
+def parse_pdf():
+    """Build the OpenMensa XML feed from ``downloaded.pdf``."""
+    pdf_path = "downloaded.pdf"
+    pdf = PDFQuery(pdf_path)
+    pdf.load()
+
+    canteen = LazyBuilder()
+    canteen.name = "Schraders Bistro"
+    canteen.city = "Potsdam"
+
+    monday = find_monday_date(pdf)
+
+    for offset, day in enumerate(days):
+        current_day = monday + datetime.timedelta(days=offset)
+        for i, food in enumerate(get_meals_for_day(pdf, day)):
+            # Until now, vegetarian food is always first
+            category = "Fleischlos" if i == 0 else "Fleischlich"
+            canteen.addMeal(current_day, category, food)
+
+    return canteen.toXMLFeed()
+
+
+if __name__ == "__main__":
+    print(parse_pdf())
diff --git a/om-bistro/requirements.txt b/om-bistro/requirements.txt
new file mode 100644
index 0000000..26303fd
--- /dev/null
+++ b/om-bistro/requirements.txt
@@ -0,0 +1,5 @@
+pdfquery
+pyopenmensa
+python-dateutil
+requests
+flask