
Fix some dataloaders' metadata and fix push_to_hub
holylovenia committed Jun 24, 2024
1 parent 7e6f76c · commit e2c6207
Showing 7 changed files with 80 additions and 61 deletions.
README.md (2 changes: 1 addition & 1 deletion)
@@ -8,7 +8,7 @@ Southeast Asia is home to more than 1,000 native languages. Nevertheless, Southe
 
 ### Library Installation
 
-Find seacrowd library (v0.2.0) at https://pypi.org/project/seacrowd/. (See our release notes [here](https://github.com/SEACrowd/seacrowd-datahub/releases/tag/0.2.0).)
+Find seacrowd library (v0.2.2) at https://pypi.org/project/seacrowd/. (See our release notes [here](https://github.com/SEACrowd/seacrowd-datahub/releases/tag/0.2.0).)
 
 To install SEACrowd, install the `seacrowd` package in your python environment via `pip`.
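For reference, the repos this commit pushes are ordinary script-backed datasets on the Hugging Face Hub, so the installed library pairs with `datasets` as the README template below suggests. A minimal usage sketch (the repo id follows the `SEACrowd/{dirname}` pattern from `push_to_hub.py` further down; `trust_remote_code=True` is required for script-backed repos in recent `datasets` releases):

```python
from datasets import load_dataset

# Illustrative repo id; any directory name under seacrowd/sea_datasets works
# here, since push_to_hub.py (below) creates one Hub repo per dataloader.
dset = load_dataset("SEACrowd/id_frog_story", trust_remote_code=True)
print(dset)
```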
seacrowd/sea_datasets/id_frog_story/id_frog_story.py (4 changes: 2 additions & 2 deletions)
@@ -6,7 +6,7 @@
 
 from seacrowd.utils import schemas
 from seacrowd.utils.configs import SEACrowdConfig
-from seacrowd.utils.constants import Tasks
+from seacrowd.utils.constants import Tasks, Licenses
 
 _CITATION = """\
 @article{FrogStorytelling,
@@ -31,7 +31,7 @@
 """
 _HOMEPAGE = "https://github.com/matbahasa/corpus-frog-storytelling"
 _LANGUAGES = ["ind"]
-_LICENSE = "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)"
+_LICENSE = Licenses.CC_BY_SA_4_0.value
 _LOCAL = False
 _URLS = {
     _DATASETNAME: "https://github.com/matbahasa/corpus-frog-storytelling/archive/refs/heads/master.zip",
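The `Licenses` constants referenced throughout these diffs come from `seacrowd.utils.constants`. As a hypothetical sketch of the pattern (enum names and strings here are assumptions, not the actual source), each value is a human-readable string carrying a parenthesized short tag, which is what `construct_readme` in `push_to_hub.py` later extracts for the Hub's `license:` field:

```python
from enum import Enum

class Licenses(Enum):
    # Hypothetical entries: a readable name plus a parenthesized tag
    # that construct_readme() slices out with find("(") / find(")").
    CC_BY_SA_3_0 = "Creative Commons Attribution Share Alike 3.0 (cc-by-sa-3.0)"
    CC_BY_SA_4_0 = "Creative Commons Attribution Share Alike 4.0 (cc-by-sa-4.0)"
    OTHERS = "Others (others)"

_LICENSE = Licenses.CC_BY_SA_4_0.value  # a plain str, so downstream code is unchanged
```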
seacrowd/sea_datasets/jv_id_tts/jv_id_tts.py (4 changes: 2 additions & 2 deletions)
@@ -7,7 +7,7 @@
 
 from seacrowd.utils import schemas
 from seacrowd.utils.configs import SEACrowdConfig
-from seacrowd.utils.constants import (DEFAULT_SEACROWD_VIEW_NAME,
+from seacrowd.utils.constants import (DEFAULT_SEACROWD_VIEW_NAME, Licenses,
                                       DEFAULT_SOURCE_VIEW_NAME, Tasks)
 
 _DATASETNAME = "jv_id_tts"
@@ -38,7 +38,7 @@
 
 _HOMEPAGE = "http://openslr.org/41/"
 
-_LICENSE = "See https://www.openslr.org/resources/41/LICENSE file for license information. Attribution-ShareAlike 4.0 (CC BY-SA 4.0)."
+_LICENSE = Licenses.CC_BY_SA_4_0.value
 
 _URLs = {
     _DATASETNAME: {
seacrowd/sea_datasets/ojw/ojw.py (3 changes: 2 additions & 1 deletion)
@@ -24,6 +24,7 @@
 import pandas as pd
 
 from seacrowd.utils.configs import SEACrowdConfig
+from seacrowd.utils.constants import Licenses
 
 _CITATION = """\
 @inproceedings{moeljadi-aminullah-2020-building,
@@ -61,7 +62,7 @@
 _HOMEPAGE = "https://github.com/davidmoeljadi/OJW"
 
 
-_LICENSE = "Creative Commons Attribution 4.0 International (CC BY 4.0)"
+_LICENSE = Licenses.CC_BY_SA_4_0.value
 
 
 _URLS = {
seacrowd/sea_datasets/titml_idn/titml_idn.py (4 changes: 2 additions & 2 deletions)
@@ -7,7 +7,7 @@
 
 from seacrowd.utils import schemas
 from seacrowd.utils.configs import SEACrowdConfig
-from seacrowd.utils.constants import Tasks, DEFAULT_SOURCE_VIEW_NAME, DEFAULT_SEACROWD_VIEW_NAME
+from seacrowd.utils.constants import Licenses, Tasks, DEFAULT_SOURCE_VIEW_NAME, DEFAULT_SEACROWD_VIEW_NAME
 
 _DATASETNAME = "titml_idn"
 _SOURCE_VIEW_NAME = DEFAULT_SOURCE_VIEW_NAME
@@ ... @@
 
 _HOMEPAGE = "http://research.nii.ac.jp/src/en/TITML-IDN.html"
 
-_LICENSE = "For research purposes only. If you use this corpus, you have to cite (Lestari et al, 2006)."
+_LICENSE = Licenses.OTHERS.value + " | For research purposes only. If you use this corpus, you have to cite (Lestari et al, 2006)."
 
 _URLs = {"titml-idn": "https://huggingface.co/datasets/holylovenia/TITML-IDN/resolve/main/IndoLVCSR.zip"}
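A note on the composite license above: since `construct_readme` keys on the first parenthesized tag, appending free text after the enum value keeps the string parseable. A sketch under the same assumed `OTHERS` value as in the earlier enum sketch:

```python
# Assuming Licenses.OTHERS.value == "Others (others)":
license = "Others (others)" + " | For research purposes only. If you use this corpus, you have to cite (Lestari et al, 2006)."
license_part = license[license.find("(") + 1:license.find(")")]
print(license_part)  # "others" -- push_to_hub.py then rewrites this to "other" for the Hub
```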
seacrowd/sea_datasets/unimorph_id/unimorph_id.py (4 changes: 2 additions & 2 deletions)
@@ -20,7 +20,7 @@
 
 from seacrowd.utils import schemas
 from seacrowd.utils.configs import SEACrowdConfig
-from seacrowd.utils.constants import Tasks
+from seacrowd.utils.constants import Tasks, Licenses
 
 _CITATION = """\
 @inproceedings{pimentel-ryskina-etal-2021-sigmorphon,
@@ -105,7 +105,7 @@
 
 _HOMEPAGE = "https://github.com/unimorph/ind"
 
-_LICENSE = "Creative Commons Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)"
+_LICENSE = Licenses.CC_BY_SA_3_0.value
 
 _URLS = {
     _DATASETNAME: "https://raw.githubusercontent.com/unimorph/ind/main/ind",
seacrowd/utils/push_to_hub.py (120 changes: 69 additions & 51 deletions)
@@ -33,20 +33,35 @@ def construct_readme(dsetname):
     citation = import_from(module_path, "_CITATION")
     license = import_from(module_path, "_LICENSE")
 
-    languages_part = "\n- " + "\n- ".join([lang for lang in languages if len(lang) <= 3])
-    pretty_name_part = dset_name.replace("_", " ").title()
-    task_categories_part = "\n- " + "\n- ".join(task.name.replace("_", "-").lower() for task in supported_tasks)
     readme_string = "\n---"
     if "(" in license and ")" in license:
         license_part = license[license.find("(")+1:license.find(")")]
-        readme_string = f'\n---\nlicense: {license_part}\nlanguage: {languages_part}\npretty_name: {pretty_name_part}\ntask_categories: {task_categories_part}\ntags: {task_categories_part}\n---\n'
-    else:
-        readme_string = f'\n---\nlanguage: {languages_part}\npretty_name: {pretty_name_part}\ntask_categories: {task_categories_part}\ntags: {task_categories_part}\n---\n'
-    readme_string += f'\n\n# {pretty_name_part}'
+        if license_part == "others":
+            license_part = "other"
+        readme_string += f'\nlicense: {license_part}'
+
+    languages_part = "\n- " + "\n- ".join([lang for lang in languages if len(lang) <= 3])
+    readme_string += f'\nlanguage: {languages_part}'
+
+    pretty_name_part = dset_name.replace("_", " ").title()
+    readme_string += f'\npretty_name: {pretty_name_part}'
+
+    tasks = [task.name.replace("_", "-").lower() for task in supported_tasks]
+    if len(tasks) > 0:
+        task_categories_part = "\n- " + "\n- ".join(tasks)
+        readme_string += f'\ntask_categories: {task_categories_part}'
+        readme_string += f'\ntags: {task_categories_part}'
+
+    readme_string += '\n---'
 
     readme_string += f'\n\n{description}'
     if is_local:
         readme_string += "\n\nThis is a local dataset. You have to obtain this dataset separately from [{homepage}]({homepage}) to use this dataloader."
+
+    readme_string += f'\n\n## Languages\n\n{", ".join(languages)}'
+
+    readme_string += f'\n\n## Supported Tasks\n\n{", ".join([str(task.name.replace("_", " ").title()) for task in supported_tasks])}'
+
     readme_string += f'''
 \n## Dataset Usage
 ### Using `datasets` library
@@ -72,7 +87,7 @@ def construct_readme(dsetname):
     readme_string += f'\n\n## Citation\n\nIf you are using the **{dset_name.replace("_", " ").title()}** dataloader in your work, please cite the following:'
     readme_string = re.sub(r"( )+\#", "#", readme_string)
     readme_string = re.sub(r"( )+\`\`\`", "```", readme_string)
-    readme_string = re.sub(r"( ){2, 4}", "", readme_string)
+    readme_string = re.sub(r"[ \t]{2,}", "", readme_string)
     readme_string += f'\n```\n{citation}\n{_SEACROWD_CITATION}\n```'
     readme_string = re.sub(r"( )+\@", "@", readme_string)
     return readme_string
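The `re.sub` change above is the substantive fix: Python's `re` does not allow spaces inside a quantifier, so the old `{2, 4}` was parsed as literal text and the substitution never matched. A quick illustration (not from the repo):

```python
import re

line = "    ### Using `datasets` library"
# Old pattern: the space in "{2, 4}" makes re treat the braces literally,
# so it looks for a space followed by "{2, 4}" and never fires here.
print(re.sub(r"( ){2, 4}", "", line))  # prints the line unchanged
# New pattern: a genuine quantifier that strips runs of 2+ spaces or tabs.
print(re.sub(r"[ \t]{2,}", "", line))  # prints "### Using `datasets` library"
```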
@@ ... @@
 
 requirements_file = BytesIO(str.encode("seacrowd>=0.2.0"))
 
-# for dirname in ["indolem_sentiment"]:
 for i, dirname in enumerate(os.listdir(_SEA_DATASETS_PATH)):
-    if not os.path.isdir(f"{_SEA_DATASETS_PATH}/{dirname}/"):
-        print(f"{dirname} is not a directory.")
+    if not os.path.isdir(f"{_SEA_DATASETS_PATH}/{dirname}/") or dirname == "__pycache__":
+        print(f"{dirname} is not a dataloader name.")
         continue
 
-    print(f'({i} / {len(os.listdir(_SEA_DATASETS_PATH))}) {dirname}')
-
-    api.create_repo(
-        f"SEACrowd/{dirname}",
-        repo_type="dataset",
-        exist_ok=True)
-
-    api.upload_file(
-        path_or_fileobj=requirements_file,
-        path_in_repo="requirements.txt",
-        repo_id=f"SEACrowd/{dirname}",
-        repo_type="dataset",
-    )
-
-    license_file = BytesIO(str.encode(
-        import_from(f"seacrowd.sea_datasets.{dirname}.{dirname}", "_LICENSE")))
-    api.upload_file(
-        path_or_fileobj=license_file,
-        path_in_repo="LICENSE",
-        repo_id=f"SEACrowd/{dirname}",
-        repo_type="dataset",
-    )
-
-    readme_file = BytesIO(str.encode(construct_readme(dirname)))
-    api.upload_file(
-        path_or_fileobj=readme_file,
-        path_in_repo="README.md",
-        repo_id=f"SEACrowd/{dirname}",
-        repo_type="dataset",
-    )
-
-    for dataloader_py_file in os.listdir(f"{_SEA_DATASETS_PATH}/{dirname}"):
-        if dataloader_py_file.endswith(".py"):
-            dataloader_file = f"{_SEA_DATASETS_PATH}/{dirname}/{dataloader_py_file}"
-            api.upload_file(
-                path_or_fileobj=dataloader_file,
-                path_in_repo=dataloader_py_file,
-                repo_id=f"SEACrowd/{dirname}",
-                repo_type="dataset",
-            )
+    try:
+        print(f'({i} / {len(os.listdir(_SEA_DATASETS_PATH))}) {dirname}')
+
+        api.create_repo(
+            f"SEACrowd/{dirname}",
+            repo_type="dataset",
+            exist_ok=True)
+
+        api.upload_file(
+            path_or_fileobj=requirements_file,
+            path_in_repo="requirements.txt",
+            repo_id=f"SEACrowd/{dirname}",
+            repo_type="dataset",
+        )
+
+        license_file = BytesIO(str.encode(
+            import_from(f"seacrowd.sea_datasets.{dirname}.{dirname}", "_LICENSE")))
+        api.upload_file(
+            path_or_fileobj=license_file,
+            path_in_repo="LICENSE",
+            repo_id=f"SEACrowd/{dirname}",
+            repo_type="dataset",
+        )
+
+        readme_file = BytesIO(str.encode(construct_readme(dirname)))
+        api.upload_file(
+            path_or_fileobj=readme_file,
+            path_in_repo="README.md",
+            repo_id=f"SEACrowd/{dirname}",
+            repo_type="dataset",
+        )
+
+        for dataloader_py_file in os.listdir(f"{_SEA_DATASETS_PATH}/{dirname}"):
+            if dataloader_py_file.endswith(".py"):
+                dataloader_file = f"{_SEA_DATASETS_PATH}/{dirname}/{dataloader_py_file}"
+                api.upload_file(
+                    path_or_fileobj=dataloader_file,
+                    path_in_repo=dataloader_py_file,
+                    repo_id=f"SEACrowd/{dirname}",
+                    repo_type="dataset",
+                )
+    except Exception as e:
+        print(f"{dirname} ======= Error: {e}")
+        continue
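Two effects of the rewrite are worth spelling out. First, the loop body now sits in a try/except, so one failing dataloader logs an error and the push continues instead of aborting the whole run. Second, `construct_readme` now emits the Hub's YAML front matter field by field. A toy walk-through of that new logic with illustrative values (the dataset name, license string, and task list are assumptions, not output from a real run):

```python
# Illustrative inputs; the license string mirrors the parenthesized-tag
# convention of the Licenses enum values used by the dataloaders above.
license = "Creative Commons Attribution Share Alike 4.0 (cc-by-sa-4.0)"
languages = ["ind"]
dset_name = "id_frog_story"
tasks = ["self-supervised-pretraining"]

readme_string = "\n---"
if "(" in license and ")" in license:
    license_part = license[license.find("(") + 1:license.find(")")]
    if license_part == "others":
        license_part = "other"  # the Hub's license tag is "other", not "others"
    readme_string += f"\nlicense: {license_part}"

languages_part = "\n- " + "\n- ".join([lang for lang in languages if len(lang) <= 3])
readme_string += f"\nlanguage: {languages_part}"

pretty_name_part = dset_name.replace("_", " ").title()
readme_string += f"\npretty_name: {pretty_name_part}"

if len(tasks) > 0:
    task_categories_part = "\n- " + "\n- ".join(tasks)
    readme_string += f"\ntask_categories: {task_categories_part}"

readme_string += "\n---"

print(readme_string)
# ---
# license: cc-by-sa-4.0
# language:
# - ind
# pretty_name: Id Frog Story
# task_categories:
# - self-supervised-pretraining
# ---
```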
