Skip to content

Commit

Permalink
Add dockerfile for bistro om parser (#44)
Browse files Browse the repository at this point in the history
  • Loading branch information
frcroth committed Sep 18, 2023
1 parent cd51e88 commit 3cf662c
Show file tree
Hide file tree
Showing 6 changed files with 193 additions and 0 deletions.
1 change: 1 addition & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ jobs:
- sbt
- slurm-docker-cluster
- wklink
- om-bistro
runs-on: ubuntu-latest

steps:
Expand Down
13 changes: 13 additions & 0 deletions om-bistro/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@

# Container image for the Schraders Bistro OpenMensa parser (Flask app).
FROM python:3.8-slim-buster

WORKDIR /python-docker

# Install dependencies before copying the sources so this layer stays cached
# until requirements.txt changes. --no-cache-dir keeps pip's download cache
# out of the image.
COPY requirements.txt requirements.txt
RUN pip3 install --no-cache-dir -r requirements.txt

COPY . .

# Module that "flask run" loads (key=value form; the space-separated ENV
# syntax is the legacy form).
ENV FLASK_APP=openmensa_server

# Bind to all interfaces so the server is reachable from outside the container.
CMD [ "python3", "-m" , "flask", "run", "--host=0.0.0.0"]
46 changes: 46 additions & 0 deletions om-bistro/openmensa_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import datetime

from flask import Flask, make_response, Response

from pdf_downloader import download_pdf
from pdf_parser import parse_pdf

# Flask application object; the route decorators in this module register
# their handlers on it.
app = Flask(__name__)

# Date of the most recent PDF download. It starts at the date of timestamp 0
# so that the first request to /feed never matches today's date and therefore
# triggers a download.
last_download = datetime.date.fromtimestamp(0)


@app.route("/feed")
def feed():
    """Serve the OpenMensa feed parsed from the downloaded menu PDF.

    The PDF is downloaded at most once per calendar day; the date of the last
    successful download is kept in the module-level ``last_download``. If the
    download raises, ``last_download`` is left untouched so the next request
    tries again.

    Returns:
        A ``Response`` carrying the feed XML with the ``text/xml`` mimetype,
        matching what the ``/meta`` endpoint sends.
    """
    global last_download
    # Read the date once so the comparison and the stored value cannot
    # disagree when a request straddles midnight.
    today = datetime.date.today()
    # limit downloads to once a day
    if last_download != today:
        download_pdf()
        last_download = today
    # A bare string would be sent as text/html; the feed is XML.
    return Response(parse_pdf(), mimetype="text/xml")


@app.route("/meta")
def meta():
    """Return the static OpenMensa metadata document for the canteen.

    The document names the canteen, its address and availability, and points
    to the feed URL together with its fetch schedule.
    """
    canteen_metadata = """
<openmensa version="2.1"
xmlns="http://openmensa.org/open-mensa-v2"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://openmensa.org/open-mensa-v2 http://openmensa.org/open-mensa-v2.xsd">
<canteen>
<name>Schraders Bistro</name>
<address>August-Bebel-Straße 26–53 14482 Potsdam</address>
<availability>public</availability>
<feed name="full">
<url>https://openmensa.scm.io/feed</url>
<schedule dayOfMonth="*" dayOfWeek="1" hour="8"/>
</feed>
</canteen>
</openmensa>
"""
    response = Response(canteen_metadata, mimetype="text/xml")
    return response


@app.route("/health_check")
def health_check():
    """Answer health probes with the body "OK" and HTTP status 200."""
    ok_response = make_response("OK", 200)
    return ok_response
27 changes: 27 additions & 0 deletions om-bistro/pdf_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import datetime

import requests

# Reference:
# https://ein-anderes-mahl.de/wp-content/uploads/2023/04/Speisen-Schraders-11.09.-15.09.2023.pdf


def get_current_pdf(today=None):
    """Build the URL of the menu PDF for the week that contains ``today``.

    The file name carries the Monday of the week as ``DD.MM.`` and the Friday
    as ``DD.MM.YYYY``, e.g. ``Speisen-Schraders-11.09.-15.09.2023.pdf``.

    Args:
        today: The ``datetime.date`` whose week is wanted. Defaults to the
            current date.

    Returns:
        The PDF URL as a string.
    """
    if today is None:
        # Read the clock once so Monday and Friday are derived from the
        # same day even if the call straddles midnight.
        today = datetime.date.today()
    monday = today - datetime.timedelta(days=today.weekday())
    friday = monday + datetime.timedelta(days=4)
    monday_string = monday.strftime("%d.%m.")
    friday_string = friday.strftime("%d.%m.%Y")
    # NOTE(review): the upload directory is fixed to 2023/04, as in the
    # reference URL; confirm that newer menus are still published there.
    return (
        "https://ein-anderes-mahl.de/wp-content/uploads/2023/04/"
        f"Speisen-Schraders-{monday_string}-{friday_string}.pdf"
    )


def download_pdf():
    """Download this week's menu PDF and store it as ``downloaded.pdf``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status. The local file is left
            untouched in that case, so an error page is never stored as if
            it were the menu.
    """
    pdf_url = get_current_pdf()
    # Without a timeout a stalled server would block the caller indefinitely.
    r = requests.get(pdf_url, timeout=30)
    # Fail loudly on 4xx/5xx instead of writing the error body to disk.
    r.raise_for_status()
    with open("downloaded.pdf", "wb") as f:
        f.write(r.content)


if __name__ == "__main__":
    # Running this module as a script performs a single download.
    download_pdf()
101 changes: 101 additions & 0 deletions om-bistro/pdf_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import datetime
import re

from dateutil.parser import parse
from pdfquery import PDFQuery
from pyopenmensa.feed import LazyBuilder

# German weekday labels (Monday to Friday). They are used both to locate each
# day's heading in the PDF and to detect where the next day's section begins.
days = ["Montag", "Dienstag", "Mittwoch", "Donnerstag", "Freitag"]



def get_text_until_next_day(pdf, day_label):
    """Collect the texts between one weekday heading and the next delimiter.

    Starting at the element after the first text line containing
    ``day_label``, sibling elements are walked until one is reached whose
    first child mentions a weekday from ``days`` or the phrase
    "wechselnden Speisen".

    Args:
        pdf: Loaded PDF object exposing a ``pq`` query method.
        day_label: Weekday label to look for, e.g. ``"Montag"``.

    Returns:
        The texts of all children of the elements in between, in document
        order. Empty if the heading is not found.
    """
    query = f"LTTextLineHorizontal:contains('{day_label}')"
    matches = pdf.pq(query)
    if not matches:
        # Heading missing from this PDF: report no texts instead of failing
        # with an IndexError.
        return []
    day_element = matches[0]

    # "wechselnden Speisen" is a hacky delimiter for the last day
    delimiters = (*days, "wechselnden Speisen")

    def is_date_demarker(element):
        children = element.getchildren()
        text = children[0].text if children else None
        if text is None:
            # No children or no text: cannot be a delimiter.
            return False
        return any(marker in text for marker in delimiters)

    strings = []
    current_element = day_element.getnext()
    # A None result from getnext() is treated as the end of the siblings, so
    # a missing delimiter ends the section instead of raising.
    while current_element is not None and not is_date_demarker(current_element):
        strings.extend(
            child.text
            for child in current_element.getchildren()
            if child.text is not None
        )
        current_element = current_element.getnext()
    return strings


def filter_texts(texts):
    """Yield the stripped, non-empty texts that are not bare prices.

    Args:
        texts: Iterable of strings; ``None`` entries are skipped.

    Yields:
        Each remaining text with surrounding whitespace removed.
    """
    # A price is digits, a decimal comma and one or two digits, e.g. "5,90"
    # or "12,50". Any number of leading digits is accepted so that prices of
    # ten or more are dropped as well.
    price_regex = r"\d+,\d\d?"
    for text in texts:
        if text is None:
            # Elements without text would otherwise fail on .strip().
            continue
        text = text.strip()
        if not text or re.fullmatch(price_regex, text):
            continue
        yield text


def parse_texts_into_meals(texts):
    """Group filtered text lines into meal descriptions.

    A line starting with "- " opens a new meal; the lines that follow are
    appended to it, separated by single spaces. Lines seen before the first
    meal has been opened are dropped.

    Yields:
        One string per meal, without the leading "- ".
    """
    current_meal = ""
    for line in filter_texts(texts):
        if line.startswith("- "):
            # A new meal begins: emit the one collected so far, if any.
            if current_meal:
                yield current_meal
            current_meal = line[2:]
        elif current_meal:
            current_meal = f"{current_meal} {line}"
    if current_meal:
        yield current_meal


def get_meals_for_day(pdf, day_label):
    """Return the list of meal descriptions found under ``day_label``."""
    day_texts = get_text_until_next_day(pdf, day_label)
    meals = [*parse_texts_into_meals(day_texts)]
    return meals


def find_monday_date(pdf) -> datetime.date:
    """Read the first date of the "Mittagstisch vom ..." heading.

    Dates in the heading are read as day.month with an optional four-digit
    year (``DD.MM`` or ``DD.MM.YYYY``). If the first date has no year, the
    year is taken from the next date in the heading that has one, minus one
    when the first date's month is later than that date's month (a week
    spanning New Year). If no date carries a year, the current year is used.

    Args:
        pdf: Loaded PDF object exposing a ``pq`` query method.

    Returns:
        The first date of the heading, or ``None`` (after printing a
        message) when the heading is missing or holds no valid date.
    """
    try:
        date_string = (
            pdf.pq("LTTextLineHorizontal:contains('Mittagstisch vom')")[0]
            .getchildren()[0]
            .text
        )
        # Parse the fields explicitly rather than handing a bare "DD.MM" to
        # a generic date parser, which has to guess the field order and year.
        dates = re.findall(r"(\d\d)\.(\d\d)(?:\.(\d{4}))?", date_string)
        day, month, year = dates[0]
        if year:
            year = int(year)
        else:
            dated = next(((m, y) for _, m, y in dates[1:] if y), None)
            if dated is None:
                year = datetime.date.today().year
            else:
                year = int(dated[1])
                if int(month) > int(dated[0]):
                    # e.g. "30.12. - 03.01.2025": the first date lies in 2024.
                    year -= 1
        return datetime.date(year, int(month), int(day))
    except (AttributeError, IndexError, TypeError, ValueError):
        print("Could not parse date")
        return None


def parse_pdf():
    """Turn ``downloaded.pdf`` into an OpenMensa XML feed.

    The first date of the menu heading is assigned to the first entry of
    ``days`` and advanced by one day per entry. The first meal of each day is
    filed under "Fleischlos", all following meals under "Fleischlich".

    Returns:
        The feed as produced by ``LazyBuilder.toXMLFeed()``.
    """
    document = PDFQuery("downloaded.pdf")
    document.load()

    builder = LazyBuilder()
    builder.name = "Schraders Bistro"
    builder.city = "Potsdam"

    meal_date = find_monday_date(document)
    one_day = datetime.timedelta(days=1)

    for weekday_label in days:
        meals = get_meals_for_day(document, weekday_label)
        for position, meal in enumerate(meals):
            # Until now, vegetarian food is always first
            if position == 0:
                category = "Fleischlos"
            else:
                category = "Fleischlich"
            builder.addMeal(meal_date, category, meal)
        meal_date = meal_date + one_day

    return builder.toXMLFeed()


if __name__ == "__main__":
    # Running this module as a script prints the feed built from the
    # already downloaded PDF.
    print(parse_pdf())
5 changes: 5 additions & 0 deletions om-bistro/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pdfquery
pyopenmensa
python-dateutil
requests
flask

0 comments on commit 3cf662c

Please sign in to comment.