Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add dockerfile for bistro om parser #44

Merged
merged 4 commits into from
Sep 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ jobs:
- sbt
- slurm-docker-cluster
- wklink
- om-bistro
runs-on: ubuntu-latest

steps:
Expand Down
13 changes: 13 additions & 0 deletions om-bistro/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@

# Container image for the Schraders Bistro OpenMensa parser (Flask app).
FROM python:3.8-slim-buster

WORKDIR /python-docker

# Copy and install the dependency list before the sources so this layer stays
# cached until requirements.txt itself changes.
COPY requirements.txt requirements.txt
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip3 install --no-cache-dir -r requirements.txt

COPY . .

# key=value form; the whitespace-separated ENV form is a legacy syntax.
ENV FLASK_APP=openmensa_server

# Bind to all interfaces so the server is reachable from outside the container.
CMD ["python3", "-m", "flask", "run", "--host=0.0.0.0"]
46 changes: 46 additions & 0 deletions om-bistro/openmensa_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import datetime

from flask import Flask, make_response, Response

from pdf_downloader import download_pdf
from pdf_parser import parse_pdf

# Flask application object; the route decorators below register handlers on it.
app = Flask(__name__)

# Date of the most recent PDF download. Starts at the date of Unix timestamp 0
# so that it differs from today's date on the first /feed request.
last_download = datetime.date.fromtimestamp(0)


@app.route("/feed")
def feed():
    """Serve the OpenMensa meal feed as XML.

    The menu PDF is re-downloaded on the first request of each calendar day;
    later requests on the same day only re-parse the file already on disk.

    Returns:
        A Flask ``Response`` carrying the feed XML with mimetype ``text/xml``
        (the same mimetype the ``/meta`` endpoint uses).
    """
    global last_download
    # Read the clock once so the comparison and the stored value cannot
    # disagree if the request straddles midnight.
    today = datetime.date.today()
    # limit downloads to once a day
    if last_download != today:
        download_pdf()
        # Only recorded after a successful download, so a failed attempt is
        # retried on the next request.
        last_download = today
    # Returning the bare string would be served as text/html.
    return Response(parse_pdf(), mimetype="text/xml")


@app.route("/meta")
def meta():
    """Serve the static OpenMensa canteen metadata document as ``text/xml``."""
    canteen_metadata = """
<openmensa version="2.1"
xmlns="http://openmensa.org/open-mensa-v2"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://openmensa.org/open-mensa-v2 http://openmensa.org/open-mensa-v2.xsd">
<canteen>
<name>Schraders Bistro</name>
<address>August-Bebel-Straße 26–53 14482 Potsdam</address>
<availability>public</availability>
<feed name="full">
<url>https://openmensa.scm.io/feed</url>
<schedule dayOfMonth="*" dayOfWeek="1" hour="8"/>
</feed>
</canteen>
</openmensa>
"""
    return Response(response=canteen_metadata, mimetype="text/xml")


@app.route("/health_check")
def health_check():
    """Answer health probes with a plain ``OK`` body and HTTP status 200."""
    body, status = "OK", 200
    return make_response(body, status)
27 changes: 27 additions & 0 deletions om-bistro/pdf_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import datetime

import requests

# Reference:
# https://ein-anderes-mahl.de/wp-content/uploads/2023/04/Speisen-Schraders-11.09.-15.09.2023.pdf


def get_current_pdf(today=None):
    """Return the expected URL of the menu PDF for the week containing *today*.

    The file name embeds the week's Monday as ``DD.MM.`` and its Friday as
    ``DD.MM.YYYY``, following the reference URL noted at the top of this
    module.

    Args:
        today: Date used to determine the week. Defaults to the current local
            date; pass an explicit date to get a reproducible result.

    Returns:
        The download URL as a string.
    """
    if today is None:
        # Read the clock once; calling date.today() twice could yield two
        # different days around midnight.
        today = datetime.date.today()
    monday = today - datetime.timedelta(days=today.weekday())
    friday = monday + datetime.timedelta(days=4)
    # NOTE(review): the "2023/04" upload folder is hard-coded in the path;
    # presumably it has to be revisited if the site changes where it uploads.
    return (
        "https://ein-anderes-mahl.de/wp-content/uploads/2023/04/"
        f"Speisen-Schraders-{monday:%d.%m.}-{friday:%d.%m.%Y}.pdf"
    )
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's hope the URL stays stable!



def download_pdf():
    """Download the current week's menu PDF to ``downloaded.pdf``.

    The file is written to the current working directory and overwrites any
    previous download.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status. In that case the
            existing ``downloaded.pdf`` is left untouched.
    """
    pdf_url = get_current_pdf()
    # Without a timeout a stalled server would block the caller indefinitely.
    r = requests.get(pdf_url, timeout=30)
    # Fail loudly instead of saving an HTML error page under a .pdf name.
    r.raise_for_status()
    with open("downloaded.pdf", "wb") as f:
        f.write(r.content)


if __name__ == "__main__":
    download_pdf()
101 changes: 101 additions & 0 deletions om-bistro/pdf_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import datetime
import re

from dateutil.parser import parse
from pdfquery import PDFQuery
from pyopenmensa.feed import LazyBuilder

# German weekday labels (Monday to Friday), in the order the parser walks them.
days = ["Montag", "Dienstag", "Mittwoch", "Donnerstag", "Freitag"]



def get_text_until_next_day(pdf, day_label):
    """Collect the text fragments listed under *day_label* in the menu PDF.

    Starting at the first text line containing *day_label*, the following
    sibling elements are walked until one mentions a weekday label or the
    phrase "wechselnden Speisen", or until there are no more siblings.

    Args:
        pdf: A loaded ``PDFQuery`` document.
        day_label: Weekday label to look for, e.g. ``"Montag"``.

    Returns:
        The texts of the children of every element between the day heading
        and the next delimiter, in document order. Empty if the label does
        not occur in the document.
    """
    query = f"LTTextLineHorizontal:contains('{day_label}')"
    matches = pdf.pq(query)
    if not matches:
        # The day is not on the menu at all; report no texts rather than
        # failing with an IndexError.
        return []
    day_element = matches[0]

    # "wechselnden Speisen" is a hacky delimiter for the last day
    delimiters = (*days, "wechselnden Speisen")

    def is_date_demarker(element):
        children = element.getchildren()
        text = children[0].text if children else None
        if text is None:
            # An element without text cannot be a delimiter.
            return False
        return any(marker in text for marker in delimiters)

    strings = []
    current_element = day_element.getnext()
    # getnext() yields None after the last sibling, so stop there instead of
    # raising when no delimiter follows the day's section.
    while current_element is not None and not is_date_demarker(current_element):
        for child in current_element.getchildren():
            if child.text is not None:
                strings.append(child.text)
        current_element = current_element.getnext()
    return strings


# A line consisting solely of a price such as "4,5", "4,50" or "12,90".
_PRICE_RE = re.compile(r"\d+,\d\d?")


def filter_texts(texts):
    """Yield the stripped, meaningful lines of *texts*.

    Missing (``None``) entries, blank lines and lines that consist only of a
    price are dropped; every other line is yielded with surrounding
    whitespace removed.

    Args:
        texts: Iterable of raw text fragments (strings or ``None``).

    Yields:
        The remaining fragments, stripped.
    """
    for text in texts:
        if text is None:
            continue
        text = text.strip()
        if not text:
            continue
        # fullmatch already anchors both ends; "\d+" also covers prices of
        # ten or more, which would otherwise end up inside a meal name.
        if _PRICE_RE.fullmatch(text):
            continue
        yield text


def parse_texts_into_meals(texts):
    """Group filtered text lines into meal descriptions.

    A line starting with "- " opens a new meal (the marker itself is removed);
    every following line is appended to that meal, separated by a single
    space. Lines that appear before the first "- " line are ignored.

    Yields:
        One string per meal, in input order.
    """
    pending = ""
    for line in filter_texts(texts):
        opens_meal = line.startswith("- ")
        if opens_meal and pending:
            # The previous meal is complete once the next bullet shows up.
            yield pending
        if opens_meal:
            pending = line[2:]
        elif pending:
            pending = f"{pending} {line}"
    if pending:
        yield pending


def get_meals_for_day(pdf, day_label):
    """Return the meal descriptions found under *day_label* as a list."""
    raw_lines = get_text_until_next_day(pdf, day_label)
    return [*parse_texts_into_meals(raw_lines)]


# Matches "DD.MM" optionally followed by ".YYYY", e.g. "11.09" or "15.09.2023".
_HEADER_DATE_RE = re.compile(r"(\d{1,2})\.(\d{1,2})(?:\.(\d{4}))?")


def _monday_from_header(header, today=None):
    """Return the first date named in *header* as a ``datetime.date``.

    The year is taken from the first date in the header that carries one
    (presumably the end of the range, e.g. "11.09. - 15.09.2023"); if none
    does, the year of *today* (default: the current date) is used.
    """
    found = _HEADER_DATE_RE.findall(header)
    if not found:
        raise ValueError(f"no DD.MM date found in {header!r}")
    day, month = int(found[0][0]), int(found[0][1])
    dated = next(((int(m), int(y)) for _, m, y in found if y), None)
    if dated is None:
        year = (today or datetime.date.today()).year
    else:
        dated_month, year = dated
        if dated_month < month:
            # The dated entry lies in an earlier month than the start, so the
            # range crosses New Year and the start belongs to the year before.
            year -= 1
    return datetime.date(year, month, day)


def find_monday_date(pdf) -> datetime.date:
    """Return the first date of the menu week announced in the PDF header.

    The date is read from the first text line containing "Mittagstisch vom".
    Day and month are parsed explicitly as ``DD.MM`` instead of leaving a bare
    fragment to a generic date parser's guesswork.

    Args:
        pdf: A loaded ``PDFQuery`` document.

    Returns:
        The start date of the menu week.

    Raises:
        ValueError: If the header line is missing or contains no usable date.
            (Previously the failure was only printed and ``None`` returned,
            contradicting the return annotation.)
    """
    try:
        date_string = (
            pdf.pq("LTTextLineHorizontal:contains('Mittagstisch vom')")[0]
            .getchildren()[0]
            .text
        )
        return _monday_from_header(date_string)
    except (IndexError, AttributeError, TypeError, ValueError) as err:
        raise ValueError("Could not parse date") from err


def parse_pdf():
    """Parse ``downloaded.pdf`` into an OpenMensa XML feed.

    For each weekday label in ``days`` the meals listed under it are added to
    the feed, dated consecutively starting from the date returned by
    ``find_monday_date``.

    Returns:
        The feed as produced by ``LazyBuilder.toXMLFeed()``.

    Raises:
        ValueError: If the start date of the menu week cannot be determined.
    """
    pdf_path = "downloaded.pdf"

    canteen = LazyBuilder()
    canteen.name = "Schraders Bistro"
    canteen.city = "Potsdam"

    # Hand PDFQuery an explicitly opened file so the handle is released when
    # parsing is done instead of staying open after every request.
    with open(pdf_path, "rb") as pdf_file:
        pdf = PDFQuery(pdf_file)
        pdf.load()

        monday = find_monday_date(pdf)
        if monday is None:
            # Without a start date no meal can be dated; say so directly
            # rather than failing later with an unrelated TypeError.
            raise ValueError("Could not determine the first day of the menu week")

        for offset, day in enumerate(days):
            current_day = monday + datetime.timedelta(days=offset)
            for i, food in enumerate(get_meals_for_day(pdf, day)):
                # Until now, vegetarian food is always first
                category = "Fleischlos" if i == 0 else "Fleischlich"
                canteen.addMeal(current_day, category, food)

    return canteen.toXMLFeed()


if __name__ == "__main__":
    print(parse_pdf())
5 changes: 5 additions & 0 deletions om-bistro/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pdfquery
pyopenmensa
python-dateutil
requests
flask
Loading