From 302b06f329c4cb9251a845d767fa9c1a32f88970 Mon Sep 17 00:00:00 2001 From: Sean Breckenridge Date: Wed, 25 Oct 2023 19:53:49 -0700 Subject: [PATCH] guess locale based on how many paths match --- google_takeout_parser/__main__.py | 2 +- google_takeout_parser/locales/main.py | 64 ++++++------- google_takeout_parser/path_dispatch.py | 121 +++++++++++++++++++------ tests/test_google_takeout.py | 5 + tests/test_locale_paths.py | 9 ++ 5 files changed, 135 insertions(+), 66 deletions(-) create mode 100644 tests/test_locale_paths.py diff --git a/google_takeout_parser/__main__.py b/google_takeout_parser/__main__.py index 32ef79c..ab33812 100644 --- a/google_takeout_parser/__main__.py +++ b/google_takeout_parser/__main__.py @@ -62,7 +62,7 @@ def main(verbose: Optional[bool]) -> None: type=click.Choice(LOCALES, case_sensitive=False), default=None, help="Locale to use for matching filenames [default: EN]", - show_default=False, + show_default=True, envvar="GOOGLE_TAKEOUT_PARSER_LOCALE", show_envvar=True, ), diff --git a/google_takeout_parser/locales/main.py b/google_takeout_parser/locales/main.py index 9031733..1bb65ca 100644 --- a/google_takeout_parser/locales/main.py +++ b/google_takeout_parser/locales/main.py @@ -1,7 +1,6 @@ -from typing import Optional, List +from typing import List +from pathlib import Path -from ..log import logger -from .common import HandlerMap from .en import HANDLER_MAP as EN_DEFAULT_HANDLER_MAP from .de import HANDLER_MAP as DE_DEFAULT_HANDLER_MAP @@ -11,35 +10,30 @@ } -def _log_locale_options() -> None: - logger.info( - f"To silence this message, set the GOOGLE_TAKEOUT_PARSER_LOCALE to one of:: {', '.join(map(repr, LOCALES.keys()))}" - ) - - -def resolve_locale( - locale: Optional[str], - additional_handlers: List[HandlerMap], -) -> List[HandlerMap]: - # additional_handlers is passed by the user in python, should override - if additional_handlers: - logger.debug( - f"Using additional handlers (passed in python code, not based on environment variable): {additional_handlers}" - ) - return additional_handlers - - if locale is None: - logger.info("No locale specified, using default (EN)") - _log_locale_options() - return [EN_DEFAULT_HANDLER_MAP] - - ll = locale.upper() - if ll in LOCALES: - logger.debug( - f"Using locale {ll}. To override set, GOOGLE_TAKEOUT_PARSER_LOCALE" - ) - return [LOCALES[ll]] - else: - logger.warning(f"Unknown locale {locale}, using default (EN)") - _log_locale_options() - return [EN_DEFAULT_HANDLER_MAP] +def get_json_activity_paths() -> List[str]: + """ + returns the base directory name for which the json activity parses for every locale + + for example, the EN path is: + My Activity/Ads/MyActivity.json + + in german its + Meine Aktivität/Werbung/MeineAktivität.json + + this will return ['My Activity', 'Meine Aktivität'] + + this is used in HPI to find the correct directory to parse + + note: should probably remove whitespace as well, so like: + + ['My Activity', 'MyActivity', 'Meine Aktivität', 'MeineAktivität'] when testing against filepaths + """ + from ..parse_json import _parse_json_activity + + paths = [] + for handler_map in LOCALES.values(): + for path, function in handler_map.items(): + if function == _parse_json_activity: + paths.append(Path(path.strip("/")).parts[0]) + + return paths diff --git a/google_takeout_parser/path_dispatch.py b/google_takeout_parser/path_dispatch.py index b28a177..4bec2b2 100644 --- a/google_takeout_parser/path_dispatch.py +++ b/google_takeout_parser/path_dispatch.py @@ -26,8 +26,8 @@ from . import __version__ as _google_takeout_version from .common import Res, PathIsh -from .locales.main import resolve_locale from .locales.common import BaseResults, HandlerFunction, HandlerMap +from .locales.main import LOCALES, get_json_activity_paths from .cache import takeout_cache_path @@ -118,7 +118,7 @@ def _handler_map_to_list( """ handlers: List[HandlerMap] = [] if passed_locale_map is not None: - if isinstance(passed_locale_map, list): + if isinstance(passed_locale_map, Sequence): for h in passed_locale_map: assert isinstance(h, dict), f"Expected dict, got {type(h)}" handlers.append(h) @@ -172,38 +172,87 @@ def __init__( self.error_policy: ErrorPolicy = error_policy self.warn_exceptions = warn_exceptions self.handlers = self._resolve_locale_handler_map( - locale_name=locale_name, passed_locale_map=handlers + takeout_dir=self.takeout_dir, + locale_name=locale_name, + passed_locale_map=handlers, ) - # TODO: check if there's some directory we expect to be there based on the computed handler map instead? - # self._warn_if_no_activity() + self._warn_if_no_activity() - @staticmethod + @classmethod def _resolve_locale_handler_map( + cls, *, + takeout_dir: Path, locale_name: Optional[str], passed_locale_map: Union[HandlerMap, List[HandlerMap], None] = None, ) -> List[HandlerMap]: + # any passed locale map overrides the environment variable, this would only # really be done by someone calling this manually in python handlers = _handler_map_to_list(passed_locale_map) + if len(handlers) > 0: + return handlers # if no locale is specified, use the environment variable if locale_name is None: - locale_name = os.environ.get("GOOGLE_TAKEOUT_PARSER_LOCALE", "EN") + locale_name = os.environ.get("GOOGLE_TAKEOUT_PARSER_LOCALE") - return resolve_locale(locale_name, handlers) + if locale_name is not None: + logger.debug(f"User specified locale: {locale_name}") - def _warn_if_no_activity(self) -> None: - # most common is probably 'My Activity'? - # can be used as a check to see if the user passed a wrong directory - - # TODO: extract activity_dir from selected DEFAULT_HANDLER_MAP - activity_dir = "My Activity" - expected = self.takeout_dir / activity_dir - if not expected.exists(): - logger.warning( - f"Warning: given '{self.takeout_dir}', expected the '{activity_dir}' directory at '{expected}'. Perhaps you passed the wrong location?" + if locale_name is not None and locale_name in LOCALES: + logger.debug( + f"Using locale {locale_name}. To override set, GOOGLE_TAKEOUT_PARSER_LOCALE" ) + return [LOCALES[locale_name]] + + # if not provided, guess by using the dispatch map with all known handlers, + # using the one with the maximum number of matches + return cls._guess_locale(takeout_dir=takeout_dir) + + @classmethod + def _guess_locale( + cls, + *, + takeout_dir: Path, + ) -> List[HandlerMap]: + logger.debug( + "No locale specified, guessing based on how many filepaths match from each locale" + ) + locale_scores: Dict[str, int] = { + locale_name: len( + cls.dispatch_map_pure( + takeout_dir=takeout_dir, + handler_maps=[locale_map], + warn_exceptions=False, # dont warn here, we expect a bunch of path misses + ) + ) + for locale_name, locale_map in LOCALES.items() + } + + logger.debug(f"Locale scores: {locale_scores}") + + # if theres multiple max values, return both of them + max_score = max(locale_scores.values()) + + matched_locales = [ + name for name, score in locale_scores.items() if score == max_score + ] + + logger.debug(f"Using locales: {matched_locales}") + + return [LOCALES[name] for name in matched_locales] + + def _warn_if_no_activity(self) -> None: + expect_one_of = get_json_activity_paths() + + for activity_dir in expect_one_of: + if (self.takeout_dir / activity_dir).exists(): + return + + logger.warning( + f"Warning: given '{self.takeout_dir}', expected one of '{expect_one_of}' to exist, perhaps you passed the wrong location?" + ) @staticmethod def _match_handler(p: Path, handler: HandlerMap) -> HandlerMatch: @@ -224,15 +273,31 @@ def _match_handler(p: Path, handler: HandlerMap) -> HandlerMatch: else: return RuntimeError(f"No function to handle parsing {sf}") - # TODO: cache? may run into issues though def dispatch_map(self) -> Dict[Path, HandlerFunction]: + return self.dispatch_map_pure( + takeout_dir=self.takeout_dir, + handler_maps=self.handlers, + warn_exceptions=self.warn_exceptions, + ) + + @classmethod + def dispatch_map_pure( + cls, + *, + takeout_dir: Path, + handler_maps: List[HandlerMap], + warn_exceptions: bool = True, + ) -> Dict[Path, HandlerFunction]: + """ + A pure function for dispatch map so it can be used in other contexts (e.g. to detect locales by scanning the directory) + """ res: Dict[Path, HandlerFunction] = {} - for f in sorted(self.takeout_dir.rglob("*")): + for f in sorted(takeout_dir.rglob("*")): if f.name.startswith("."): continue if not f.is_file(): continue - rf = f.relative_to(self.takeout_dir) + rf = f.relative_to(takeout_dir) # try to resolve file to parser-function by checking all supplied handlers @@ -240,8 +305,8 @@ def dispatch_map(self) -> Dict[Path, HandlerFunction]: file_resolved: bool = False handler_exception: Optional[Exception] = None - for handler in self.handlers: - file_handler: HandlerMatch = self.__class__._match_handler(rf, handler) + for handler in handler_maps: + file_handler: HandlerMatch = cls._match_handler(rf, handler) # file_handler matched something if not isinstance(file_handler, Exception): # if not explicitly ignored by the handler map @@ -256,7 +321,7 @@ def dispatch_map(self) -> Dict[Path, HandlerFunction]: # this is an exception specifying an unhandled file # this shouldn't cause a fatal error, so don't check # error_policy here, just warn the user - if self.warn_exceptions: + if warn_exceptions: logger.warning(str(handler_exception)) return res @@ -293,9 +358,7 @@ def _handle_errors(self, results: BaseResults) -> BaseResults: elif self.error_policy == "drop": continue - def parse( - self, cache: bool = False, filter_type: FilterType = None - ) -> BaseResults: + def parse(self, cache: bool = False, filter_type: FilterType = None) -> BaseResults: """ Parses the Takeout @@ -367,9 +430,7 @@ def _determine_cache_path(self, cache_key: CacheKey) -> str: part = os.path.join(*self.takeout_dir.parts[1:]) return str(base / part / _cache_key_to_str(cache_key)) - def _cached_parse( - self, filter_type: FilterType = None - ) -> BaseResults: + def _cached_parse(self, filter_type: FilterType = None) -> BaseResults: handlers = self._group_by_return_type(filter_type=filter_type) for cache_key, result_tuples in handlers.items(): _ret_type: Any = _cache_key_to_type(cache_key) diff --git a/tests/test_google_takeout.py b/tests/test_google_takeout.py index b8de0e5..fca3787 100644 --- a/tests/test_google_takeout.py +++ b/tests/test_google_takeout.py @@ -1,4 +1,5 @@ from google_takeout_parser.path_dispatch import TakeoutParser +from google_takeout_parser.locales.main import LOCALES from .common import testdata @@ -12,6 +13,8 @@ def test_structure() -> None: assert len(files) == 53 assert len(m) == 35 + assert tk._guess_locale(takeout_dir=tk.takeout_dir) == [LOCALES["EN"]] + def test_structure_ger() -> None: recent_takeout = testdata / "RecentTakeout_ger" @@ -21,3 +24,5 @@ def test_structure_ger() -> None: m = tk.dispatch_map() assert len(files) == 51 assert len(m) == 7 + + assert tk._guess_locale(takeout_dir=tk.takeout_dir) == [LOCALES["DE"]] diff --git a/tests/test_locale_paths.py b/tests/test_locale_paths.py new file mode 100644 index 0000000..887fd4c --- /dev/null +++ b/tests/test_locale_paths.py @@ -0,0 +1,9 @@ +from google_takeout_parser.locales.main import get_json_activity_paths, LOCALES + + +def test_locale_paths() -> None: + jpths = get_json_activity_paths() + assert len(jpths) > len(LOCALES) + + assert "My Activity" in jpths + assert "Meine Aktivitäten" in jpths