Skip to content

Commit

Permalink
Fixed #250 #251: add name to languages
Browse files Browse the repository at this point in the history
The name and filename now use the real domain together with the language code.
Language metadata is also properly populated.
Language is extracted from domain name using known patterns.
Meta domains are all considered English.
  • Loading branch information
rgaudin committed May 15, 2022
1 parent c5b78c3 commit 31c9ce3
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 2 deletions.
20 changes: 19 additions & 1 deletion src/sotoki/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from typing import Optional, List
from dataclasses import dataclass, field

from zimscraperlib.i18n import get_language_details, NotFound

ROOT_DIR = pathlib.Path(__file__).parent
NAME = ROOT_DIR.name

Expand Down Expand Up @@ -40,6 +42,19 @@
NB_PAGINATED_USERS = NB_USERS_PER_PAGE * NB_USERS_PAGES


def lang_for_domain(domain):
    """Return the ``(ISO-639-1, ISO-639-3)`` language codes for a site domain.

    The first label of ``<label>.stackexchange.com`` or
    ``<label>.stackoverflow.com`` is handed to ``get_language_details`` and,
    when that lookup succeeds, its ``"iso-639-1"`` and ``"iso-639-3"`` entries
    are returned as a 2-tuple.

    ``("en", "eng")`` is returned when the domain does not follow that
    pattern, when the label is ``meta``, or when the lookup raises
    ``NotFound``.

    NOTE(review): a topical label that happens to spell a language code or
    name (e.g. "cs", "or") would presumably be resolved as that language --
    confirm whether that is intended.
    """
    english = ("en", "eng")

    found = re.match(r"^(?P<lang>[a-z]+)\.(stackexchange|stackoverflow)\.com$", domain)
    if found is None:
        return english

    label = found.group("lang")
    if label == "meta":
        # meta sites are always treated as English
        return english

    try:
        details = get_language_details(label)
        return details["iso-639-1"], details["iso-639-3"]
    except NotFound:
        # label is not a known language: keep the English default
        return english


@dataclass
class Sotoconf:
required = [
Expand All @@ -61,6 +76,8 @@ class Sotoconf:
publisher: Optional[str] = ""
fname: Optional[str] = ""
tag: List[str] = field(default_factory=list)
iso_lang_1: str = "en" # ISO-639-1
iso_lang_3: str = "eng" # ISO-639-3

# customization
favicon: Optional[str] = ""
Expand Down Expand Up @@ -147,7 +164,8 @@ def any_restriction(self):
def __post_init__(self):
self.dump_domain = self.domain # dumps are named after unfixed domains
self.domain = FIXED_DOMAINS.get(self.domain, self.domain)
self.name = self.domain
self.iso_lang_1, self.iso_lang_3 = lang_for_domain(self.domain)
self.name = f"{self.domain}_{self.iso_lang_1}"
self.output_dir = pathlib.Path(self._output_dir).expanduser().resolve()
self.output_dir.mkdir(parents=True, exist_ok=True)
self.tmp_dir = pathlib.Path(self._tmp_dir).expanduser().resolve()
Expand Down
1 change: 1 addition & 0 deletions src/sotoki/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ def run(self):
logger.info(
f"Starting scraper with:\n"
f" domain: {self.domain}\n"
f" lang: {self.conf.iso_lang_1} ({self.conf.iso_lang_3})\n"
f" build_dir: {self.build_dir}\n"
f" output_dir: {self.conf.output_dir}\n"
f"{s3_msg}"
Expand Down
2 changes: 1 addition & 1 deletion src/sotoki/utils/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def setup():
filename=Global.conf.output_dir.joinpath(Global.conf.fname),
main_path="questions",
favicon_path="illustration",
language="eng",
language=Global.conf.iso_lang_3,
title=Global.conf.title,
description=Global.conf.description,
creator=Global.conf.author,
Expand Down

0 comments on commit 31c9ce3

Please sign in to comment.