Skip to content

Commit

Permalink
guess locale based on how many paths match
Browse files Browse the repository at this point in the history
  • Loading branch information
purarue committed Oct 26, 2023
1 parent fa9e7ec commit 302b06f
Show file tree
Hide file tree
Showing 5 changed files with 135 additions and 66 deletions.
2 changes: 1 addition & 1 deletion google_takeout_parser/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def main(verbose: Optional[bool]) -> None:
type=click.Choice(LOCALES, case_sensitive=False),
default=None,
help="Locale to use for matching filenames [default: EN]",
show_default=False,
show_default=True,
envvar="GOOGLE_TAKEOUT_PARSER_LOCALE",
show_envvar=True,
),
Expand Down
64 changes: 29 additions & 35 deletions google_takeout_parser/locales/main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import Optional, List
from typing import List
from pathlib import Path

from ..log import logger
from .common import HandlerMap
from .en import HANDLER_MAP as EN_DEFAULT_HANDLER_MAP
from .de import HANDLER_MAP as DE_DEFAULT_HANDLER_MAP

Expand All @@ -11,35 +10,30 @@
}


def _log_locale_options() -> None:
    """Log (at info level) the locale names known to ``LOCALES``.

    The message tells the user which values GOOGLE_TAKEOUT_PARSER_LOCALE
    accepts, so they can set it and silence the notice.
    """
    # repr() each name so the options show up quoted in the log line
    options = ", ".join(repr(name) for name in LOCALES.keys())
    logger.info(
        f"To silence this message, set the GOOGLE_TAKEOUT_PARSER_LOCALE to one of:: {options}"
    )


def resolve_locale(
    locale: Optional[str],
    additional_handlers: List[HandlerMap],
) -> List[HandlerMap]:
    """Pick the handler maps to use for parsing.

    Resolution order:
      1. ``additional_handlers`` (passed from python code), if non-empty,
         is returned as-is and the locale is ignored.
      2. If ``locale`` is None, the EN handler map is used.
      3. Otherwise ``locale`` is upper-cased and looked up in ``LOCALES``;
         an unknown name falls back to the EN handler map with a warning.
    """
    # handlers supplied directly by the caller always win
    if additional_handlers:
        logger.debug(
            f"Using additional handlers (passed in python code, not based on environment variable): {additional_handlers}"
        )
        return additional_handlers

    if locale is None:
        logger.info("No locale specified, using default (EN)")
        _log_locale_options()
        return [EN_DEFAULT_HANDLER_MAP]

    # LOCALES is keyed by upper-case names, so normalize before the lookup
    locale_key = locale.upper()
    if locale_key not in LOCALES:
        logger.warning(f"Unknown locale {locale}, using default (EN)")
        _log_locale_options()
        return [EN_DEFAULT_HANDLER_MAP]

    logger.debug(
        f"Using locale {locale_key}. To override set, GOOGLE_TAKEOUT_PARSER_LOCALE"
    )
    return [LOCALES[locale_key]]
def get_json_activity_paths() -> List[str]:
    """
    Return the top-level directory names under which JSON activity files
    are expected, collected across every locale in ``LOCALES``.

    For each handler-map entry whose handler is ``_parse_json_activity``,
    the first component of the path pattern is taken. For example, the EN
    pattern for
        My Activity/Ads/MyActivity.json
    contributes 'My Activity'; other locales contribute their translated
    directory name.

    One item is produced per matching handler entry, so the returned list
    is not de-duplicated.

    This is used in HPI to find the correct directory to parse.

    note: callers testing against filepaths may also want to compare with
    whitespace removed (e.g. 'My Activity' and 'MyActivity').
    """
    # imported here rather than at module level
    # NOTE(review): presumably to avoid a circular import -- confirm
    from ..parse_json import _parse_json_activity

    return [
        Path(pattern.strip("/")).parts[0]
        for handlers in LOCALES.values()
        for pattern, handler in handlers.items()
        if handler == _parse_json_activity
    ]
121 changes: 91 additions & 30 deletions google_takeout_parser/path_dispatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@
from . import __version__ as _google_takeout_version
from .common import Res, PathIsh

from .locales.main import resolve_locale
from .locales.common import BaseResults, HandlerFunction, HandlerMap
from .locales.main import LOCALES, get_json_activity_paths


from .cache import takeout_cache_path
Expand Down Expand Up @@ -118,7 +118,7 @@ def _handler_map_to_list(
"""
handlers: List[HandlerMap] = []
if passed_locale_map is not None:
if isinstance(passed_locale_map, list):
if isinstance(passed_locale_map, Sequence):
for h in passed_locale_map:
assert isinstance(h, dict), f"Expected dict, got {type(h)}"
handlers.append(h)
Expand Down Expand Up @@ -172,38 +172,87 @@ def __init__(
self.error_policy: ErrorPolicy = error_policy
self.warn_exceptions = warn_exceptions
self.handlers = self._resolve_locale_handler_map(
locale_name=locale_name, passed_locale_map=handlers
takeout_dir=self.takeout_dir,
locale_name=locale_name,
passed_locale_map=handlers,
)
# TODO: check if there's some directory we expect to be there based on the computed handler map instead?
# self._warn_if_no_activity()
self._warn_if_no_activity()

@staticmethod
@classmethod
def _resolve_locale_handler_map(
cls,
*,
takeout_dir: Path,
locale_name: Optional[str],
passed_locale_map: Union[HandlerMap, List[HandlerMap], None] = None,
) -> List[HandlerMap]:

# any passed locale map overrides the environment variable, this would only
# really be done by someone calling this manually in python
handlers = _handler_map_to_list(passed_locale_map)
if len(handlers) > 0:
return handlers

# if no locale is specified, use the environment variable
if locale_name is None:
locale_name = os.environ.get("GOOGLE_TAKEOUT_PARSER_LOCALE", "EN")
locale_name = os.environ.get("GOOGLE_TAKEOUT_PARSER_LOCALE")

return resolve_locale(locale_name, handlers)
if locale_name is not None:
logger.debug(f"User specified locale: {locale_name}")

def _warn_if_no_activity(self) -> None:
# most common is probably 'My Activity'?
# can be used as a check to see if the user passed a wrong directory

# TODO: extract activity_dir from selected DEFAULT_HANDLER_MAP
activity_dir = "My Activity"
expected = self.takeout_dir / activity_dir
if not expected.exists():
logger.warning(
f"Warning: given '{self.takeout_dir}', expected the '{activity_dir}' directory at '{expected}'. Perhaps you passed the wrong location?"
if locale_name is not None and locale_name in LOCALES:
logger.debug(
f"Using locale {locale_name}. To override set, GOOGLE_TAKEOUT_PARSER_LOCALE"
)
return [LOCALES[locale_name]]

# if not provided, guess by using the dispatch map with all known handlers,
# using the one with the maximum number of matches
return cls._guess_locale(takeout_dir=takeout_dir)

@classmethod
def _guess_locale(
    cls,
    *,
    takeout_dir: Path,
) -> List[HandlerMap]:
    """Guess which locale(s) a takeout directory was exported in.

    Each locale in ``LOCALES`` is scored by the number of files under
    ``takeout_dir`` that ``dispatch_map_pure`` resolves when given only that
    locale's handler map. The handler maps of every locale tied for the
    highest score are returned.
    """
    logger.debug(
        "No locale specified, guessing based on how many filepaths match from each locale"
    )

    locale_scores: Dict[str, int] = {}
    for locale_name, locale_map in LOCALES.items():
        resolved = cls.dispatch_map_pure(
            takeout_dir=takeout_dir,
            handler_maps=[locale_map],
            # don't warn here, a bunch of path misses are expected
            warn_exceptions=False,
        )
        locale_scores[locale_name] = len(resolved)

    logger.debug(f"Locale scores: {locale_scores}")

    # several locales may share the top score; keep all of them
    best_score = max(locale_scores.values())
    matched_locales = []
    for name, score in locale_scores.items():
        if score == best_score:
            matched_locales.append(name)

    logger.debug(f"Using locales: {matched_locales}")

    return [LOCALES[name] for name in matched_locales]

def _warn_if_no_activity(self) -> None:
    """Warn when none of the known activity directories exist.

    Checks ``self.takeout_dir`` for each directory name returned by
    ``get_json_activity_paths()``; if none is present, a warning is logged
    suggesting the wrong location may have been passed.
    """
    expect_one_of = get_json_activity_paths()

    # a single existing activity directory is enough to stay quiet
    if any((self.takeout_dir / name).exists() for name in expect_one_of):
        return

    logger.warning(
        f"Warning: given '{self.takeout_dir}', expected one of '{expect_one_of}' to exist, perhaps you passed the wrong location?"
    )

@staticmethod
def _match_handler(p: Path, handler: HandlerMap) -> HandlerMatch:
Expand All @@ -224,24 +273,40 @@ def _match_handler(p: Path, handler: HandlerMap) -> HandlerMatch:
else:
return RuntimeError(f"No function to handle parsing {sf}")

# TODO: cache? may run into issues though
def dispatch_map(self) -> Dict[Path, HandlerFunction]:
    """Build the file -> handler mapping for this parser instance.

    Delegates to ``dispatch_map_pure`` using this instance's
    ``takeout_dir``, ``handlers`` and ``warn_exceptions`` settings.
    """
    resolved = self.dispatch_map_pure(
        takeout_dir=self.takeout_dir,
        handler_maps=self.handlers,
        warn_exceptions=self.warn_exceptions,
    )
    return resolved

@classmethod
def dispatch_map_pure(
cls,
*,
takeout_dir: Path,
handler_maps: List[HandlerMap],
warn_exceptions: bool = True,
) -> Dict[Path, HandlerFunction]:
"""
A pure function for dispatch map so it can be used in other contexts (e.g. to detect locales by scanning the directory)
"""
res: Dict[Path, HandlerFunction] = {}
for f in sorted(self.takeout_dir.rglob("*")):
for f in sorted(takeout_dir.rglob("*")):
if f.name.startswith("."):
continue
if not f.is_file():
continue
rf = f.relative_to(self.takeout_dir)
rf = f.relative_to(takeout_dir)

# try to resolve file to parser-function by checking all supplied handlers

# cache handler information for warning if we can't resolve the file
file_resolved: bool = False
handler_exception: Optional[Exception] = None

for handler in self.handlers:
file_handler: HandlerMatch = self.__class__._match_handler(rf, handler)
for handler in handler_maps:
file_handler: HandlerMatch = cls._match_handler(rf, handler)
# file_handler matched something
if not isinstance(file_handler, Exception):
# if not explicitly ignored by the handler map
Expand All @@ -256,7 +321,7 @@ def dispatch_map(self) -> Dict[Path, HandlerFunction]:
# this is an exception specifying an unhandled file
# this shouldn't cause a fatal error, so don't check
# error_policy here, just warn the user
if self.warn_exceptions:
if warn_exceptions:
logger.warning(str(handler_exception))

return res
Expand Down Expand Up @@ -293,9 +358,7 @@ def _handle_errors(self, results: BaseResults) -> BaseResults:
elif self.error_policy == "drop":
continue

def parse(
self, cache: bool = False, filter_type: FilterType = None
) -> BaseResults:
def parse(self, cache: bool = False, filter_type: FilterType = None) -> BaseResults:
"""
Parses the Takeout
Expand Down Expand Up @@ -367,9 +430,7 @@ def _determine_cache_path(self, cache_key: CacheKey) -> str:
part = os.path.join(*self.takeout_dir.parts[1:])
return str(base / part / _cache_key_to_str(cache_key))

def _cached_parse(
self, filter_type: FilterType = None
) -> BaseResults:
def _cached_parse(self, filter_type: FilterType = None) -> BaseResults:
handlers = self._group_by_return_type(filter_type=filter_type)
for cache_key, result_tuples in handlers.items():
_ret_type: Any = _cache_key_to_type(cache_key)
Expand Down
5 changes: 5 additions & 0 deletions tests/test_google_takeout.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from google_takeout_parser.path_dispatch import TakeoutParser
from google_takeout_parser.locales.main import LOCALES

from .common import testdata

Expand All @@ -12,6 +13,8 @@ def test_structure() -> None:
assert len(files) == 53
assert len(m) == 35

assert tk._guess_locale(takeout_dir=tk.takeout_dir) == [LOCALES["EN"]]


def test_structure_ger() -> None:
recent_takeout = testdata / "RecentTakeout_ger"
Expand All @@ -21,3 +24,5 @@ def test_structure_ger() -> None:
m = tk.dispatch_map()
assert len(files) == 51
assert len(m) == 7

assert tk._guess_locale(takeout_dir=tk.takeout_dir) == [LOCALES["DE"]]
9 changes: 9 additions & 0 deletions tests/test_locale_paths.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from google_takeout_parser.locales.main import get_json_activity_paths, LOCALES


def test_locale_paths() -> None:
    """Check the activity directory names collected across all locales."""
    activity_dirs = get_json_activity_paths()

    # more entries than locales are expected
    assert len(activity_dirs) > len(LOCALES)

    for expected_dir in ("My Activity", "Meine Aktivitäten"):
        assert expected_dir in activity_dirs

0 comments on commit 302b06f

Please sign in to comment.