Skip to content

Commit

Permalink
om-bistro: use tesseract (#49)
Browse files Browse the repository at this point in the history
  • Loading branch information
frcroth authored Feb 28, 2024
1 parent b32c386 commit b6a19b3
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 123 deletions.
30 changes: 0 additions & 30 deletions README.md

This file was deleted.

2 changes: 2 additions & 0 deletions om-bistro/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ FROM python:3.8-slim-buster

WORKDIR /python-docker

RUN apt-get update && apt-get install -y poppler-utils tesseract-ocr tesseract-ocr-deu

COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt

Expand Down
15 changes: 15 additions & 0 deletions om-bistro/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Schraders Bistro OpenMensa parser

This is an OpenMensa parser for [Schraders Bistro](https://ein-anderes-mahl.de/bistro-babelsberg/).

To run it locally, follow these steps:
1. Install dependencies (in a virtual env) with `pip install -r requirements.txt`
2. Run the server with `python -m flask --app openmensa_server run`
3. Navigate to `localhost:5000/feed`

You can also build and run the Docker container locally:

```shell
docker build -t openmensa-schrader .
docker run -p 5000:5000 openmensa-schrader
```
172 changes: 81 additions & 91 deletions om-bistro/pdf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,106 +2,96 @@
import re

from dateutil.parser import parse
from pdfquery import PDFQuery
from pyopenmensa.feed import LazyBuilder

# German weekday names (Monday to Friday) that the parser looks for as day headings in the menu.
days = ["Montag", "Dienstag", "Mittwoch", "Donnerstag", "Freitag"]


def get_text_until_next_day(pdf, day_label):
    """Collect the text pieces that follow a day heading up to the next section.

    Args:
        pdf: Loaded PDF document exposing ``pq(selector)`` (a PDFQuery object
            in this module).
        day_label: Weekday heading to look for, e.g. ``"Montag"``.

    Returns:
        The ``text`` of every child of each element after the heading, in
        document order, stopping at the first element that marks the next
        section.

    Raises:
        IndexError: If the heading is found neither in full nor without its
            first letter.
    """
    try:
        heading = pdf.pq(f"LTTextLineHorizontal:contains('{day_label}')")[0]
    except IndexError:
        # The first letter of the heading is sometimes cut off, so retry without it
        heading = pdf.pq(f"LTTextLineHorizontal:contains('{day_label[1:]}')")[0]

    def starts_next_section(element):
        # A section ends at the next weekday heading (possibly missing its
        # first letter); "wechselnden Speisen" is a hacky delimiter for the
        # last day.
        first_text = element.getchildren()[0].text
        stop_markers = [
            *days,
            *[name[1:] for name in days],
            "wechselnden Speisen",
        ]
        return any(marker in first_text for marker in stop_markers)

    collected = []
    node = heading.getnext()
    while not starts_next_section(node):
        collected.extend(child.text for child in node.getchildren())
        node = node.getnext()
    return collected


def filter_texts(texts):
    """Yield the stripped entries of *texts*, skipping blanks and bare prices.

    An entry counts as a bare price when, after stripping, it consists of one
    digit, a comma and one or two digits (e.g. ``"4,5"`` or ``"4,50"``).
    """
    price_pattern = re.compile(r"^\d,\d\d?$")
    for raw in texts:
        stripped = raw.strip()
        if stripped and not price_pattern.fullmatch(stripped):
            yield stripped


def parse_texts_into_meals(texts):
    """Yield one string per meal from the raw text pieces of a day.

    The pieces are first passed through ``filter_texts``. A piece starting
    with ``"- "`` opens a new meal (the marker itself is dropped); any other
    piece is appended, separated by a space, to the meal currently being
    built. Pieces seen before the first ``"- "`` marker are ignored.
    """
    current = ""
    for piece in filter_texts(texts):
        if piece.startswith("- "):
            # A new bullet closes the meal collected so far
            if current:
                yield current
            current = piece[2:]
        elif current:
            current = current + " " + piece
    if current:
        yield current


def get_meals_for_day(pdf, day_label):
    """Return the list of meal strings found under the heading *day_label*."""
    raw_texts = get_text_until_next_day(pdf, day_label)
    meals = parse_texts_into_meals(raw_texts)
    return [*meals]


def find_monday_date(pdf) -> datetime.date:
    """Read the first date of the "Mittagstisch vom ..." line from the PDF.

    The first ``dd.mm`` occurrence in that line is handed to ``parse`` and
    returned as a date. Any failure is reported with a printed message, and
    ``None`` is returned in that case.
    """
    try:
        heading_line = pdf.pq("LTTextLineHorizontal:contains('Mittagstisch vom')")[0]
        heading_text = heading_line.getchildren()[0].text
        dates_found = re.findall(r"\d\d\.\d\d", heading_text)
        return parse(dates_found[0]).date()
    except Exception:
        print("Could not parse date")
        return None

from pdf2image import convert_from_path
from PIL import Image
import pyocr


def read_image():
    """Run OCR on ``out.png`` and return the recognised text.

    The first OCR tool reported by pyocr is used, with the German language
    data (``"deu"``) and a plain-text builder.

    Returns:
        The text recognised in the image, as produced by the tool's
        ``image_to_string``.

    Raises:
        IndexError: If pyocr reports no available OCR tool.
    """
    lang = "deu"
    tool = pyocr.get_available_tools()[0]

    # Open the image in a context manager so the file handle is released
    # once OCR has finished instead of lingering until garbage collection.
    with Image.open("out.png") as image:
        return tool.image_to_string(
            image, lang=lang, builder=pyocr.builders.TextBuilder()
        )


def parse_ocr_text(txt):
    """Extract the priced offers of each day from the OCR text of the menu.

    Only the part of *txt* between the first "Montag" and the last
    "Portionen fleischlos" is considered. That part is split into one chunk
    per weekday heading. Within a chunk, a line starting with "-" begins a new
    offer and any other line continues the previous one. The last
    space-separated token of every offer is read as its price
    (``"4,5"`` becomes ``4.5``).

    Args:
        txt: Full OCR output of the menu page.

    Returns:
        A list with one entry per day, in document order. Each entry is a
        list of ``(offer_text, price)`` tuples with ``price`` as a float.

    Raises:
        ValueError: If one of the two markers is missing, or if an offer does
            not end in a space followed by a number.
    """
    # Discard everything before "Montag" ...
    txt = txt[txt.index("Montag"):]
    # ... and everything from the last "Portionen fleischlos" onwards
    txt = txt[: txt.rindex("Portionen fleischlos")]

    # Split in front of every line that starts with a weekday name. The
    # alternation sits directly inside the lookahead so that all five names
    # are treated the same way.
    day_chunks = re.split(r"\n(?=Montag|Dienstag|Mittwoch|Donnerstag|Freitag)", txt)

    day_offers = []
    for chunk in day_chunks:
        # Strip every line before filtering: a whitespace-only line or a
        # trailing blank would otherwise end up behind the price and make the
        # float conversion fail, and an indented "-" would not be recognised.
        lines = [line.strip() for line in chunk.split("\n")]
        lines = [line for line in lines if line]

        # The first remaining line is the day heading; the rest are offers.
        offers = []
        for line in lines[1:]:
            if line.startswith("-") or not offers:
                offers.append(line)
            else:
                # Continuation of the previous offer (wrapped line)
                offers[-1] += " " + line

        priced = []
        for offer in offers:
            if offer.startswith("-"):
                offer = offer[1:].strip()
            # The price is the last token, written with a decimal comma
            split_at = offer.rindex(" ")
            price = float(offer[split_at + 1:].replace(",", "."))
            priced.append((offer[:split_at].rstrip(), price))
        day_offers.append(priced)

    return day_offers

def find_monday_date(txt):
    """Return the Monday date announced in the OCR text.

    The line "Mittagstisch vom 26.02. - 01.03.2024" carries the first day of
    the week as "26.02."; the current year is appended to it before parsing.
    If the parsed day is not a Monday, the following Monday is returned.

    NOTE(review): the year is taken from the system clock, not from the text;
    confirm this is intended for menus spanning a change of year.
    """
    found = re.search(r"Mittagstisch vom ([\d.]+)", txt)
    day_and_month = found.group(1)
    this_year = datetime.datetime.now().year
    monday = parse(day_and_month + str(this_year), dayfirst=True)

    # weekday() is 0 on Mondays; move forward to the next Monday otherwise
    days_until_monday = (7 - monday.weekday()) % 7
    monday = monday + datetime.timedelta(days=days_until_monday)

    return monday.date()

def parse_pdf():
# Builds the feed for "Schraders Bistro" from the file "downloaded.pdf" and
# returns the result of canteen.toXMLFeed().
# NOTE(review): this span references both a PDFQuery document (`pdf`) and the
# OCR text (`txt`); it looks like removed and added lines of the diff shown
# together. Confirm which lines belong to the current version.
pdf_path = "downloaded.pdf"
pdf = PDFQuery(pdf_path)
pdf.load()
# Render the PDF to images and save the first page as "out.png" before read_image() runs
images = convert_from_path(pdf_path)
image = images[0]
image.save("out.png")
txt = read_image()
parsed = parse_ocr_text(txt)

canteen = LazyBuilder()
canteen.name = "Schraders Bistro"
canteen.city = "Potsdam"

current_day = find_monday_date(pdf)

# PDFQuery path: one lookup per weekday name; a day that fails to parse is reported and skipped
for day in days:
try:
for i, food in enumerate(get_meals_for_day(pdf, day)):
category = (
"Fleischlos" if i == 0 else "Fleischlich"
) # Until now, vegetarian food is always first
canteen.addMeal(current_day, category, food)
except Exception:
print(f"Could not parse day {day}")
current_day = find_monday_date(txt)

# OCR path: `parsed` holds one list of (offer, price) tuples per day
for day in parsed:
for i, offer in enumerate(day):
category = "Fleischlos" if i == 0 else "Fleischlich" # Until now, vegetarian food is always first
food = offer[0]
price = offer[1]
canteen.addMeal(current_day, category, food, prices = {"other" : price})
# Advance the date by one day for the next entry of `parsed`
current_day = current_day + datetime.timedelta(days=1)

return canteen.toXMLFeed()
Expand Down
6 changes: 4 additions & 2 deletions om-bistro/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
pdfquery
pyopenmensa
python-dateutil
requests
flask
flask
pdf2image
pyocr
pytesseract

0 comments on commit b6a19b3

Please sign in to comment.