version(v1.3.5): release

DoodleBears · Jul 8, 2024 · f90aa05 · f90aa05
1 parent 7acadb2
commit f90aa05
Show file tree

Hide file tree

Showing 2 changed files with 188 additions and 188 deletions.
diff --git a/setup.py b/setup.py
@@ -13,7 +13,7 @@ def read(*relpath):
 
 setup(
     name="split_lang",
-    version="1.3.4",
+    version="1.3.5",
     description="A package for splitting text by languages through concatenating over split substrings based on their language",
     long_description=read("README.md"),
     long_description_content_type="text/markdown",

diff --git a/split-lang-demo.ipynb b/split-lang-demo.ipynb
@@ -1,196 +1,196 @@
 {
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%%capture\n",
-    "%pip install split-lang==1.3.4"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
+  "cells": [
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "c:\\Users\\admin\\.conda\\envs\\melotts\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n"
-     ]
-    }
-   ],
-   "source": [
-    "from split_lang import LangSplitter\n",
-    "lang_splitter = LangSplitter()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "%%capture\n",
+        "%pip install split-lang==1.3.5"
+      ]
+    },
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "0|zh:你喜欢看\n",
-      "1|ja:アニメ\n",
-      "2|zh:吗\n"
-     ]
-    }
-   ],
-   "source": [
-    "text = \"你喜欢看アニメ吗\"\n",
-    "\n",
-    "substr = lang_splitter.split_by_lang(\n",
-    "    text=text,\n",
-    ")\n",
-    "for index, item in enumerate(substr):\n",
-    "    print(f\"{index}|{item.lang}:{item.text}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
+      "cell_type": "code",
+      "execution_count": 2,
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "c:\\Users\\admin\\.conda\\envs\\melotts\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+            "  from .autonotebook import tqdm as notebook_tqdm\n"
+          ]
+        }
+      ],
+      "source": [
+        "from split_lang import LangSplitter\n",
+        "lang_splitter = LangSplitter()"
+      ]
+    },
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "0|zh:你喜欢看\n",
-      "1|ja:アニメ\n",
-      "2|zh:吗？我也喜欢看\n",
-      "----------------------\n",
-      "0|en:Please star this project on GitHub, Thanks you. I love you\n",
-      "1|zh:请加星这个项目，谢谢你。我爱你\n",
-      "2|ja:この項目をスターしてください、ありがとうございます！愛してる\n",
-      "----------------------\n",
-      "0.007998943328857422\n"
-     ]
-    }
-   ],
-   "source": [
-    "lang_splitter.merge_across_punctuation=True\n",
-    "import time\n",
-    "texts = [\n",
-    "    \"你喜欢看アニメ吗？我也喜欢看\",\n",
-    "    \"Please star this project on GitHub, Thanks you. I love you请加星这个项目，谢谢你。我爱你この項目をスターしてください、ありがとうございます！愛してる\",\n",
-    "]\n",
-    "time1 = time.time()\n",
-    "for text in texts:\n",
-    "    substr = lang_splitter.split_by_lang(\n",
-    "        text=text,\n",
-    "    )\n",
-    "    for index, item in enumerate(substr):\n",
-    "        print(f\"{index}|{item.lang}:{item.text}\")\n",
-    "    print(\"----------------------\")\n",
-    "time2 = time.time()\n",
-    "print(time2 - time1)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
+      "cell_type": "code",
+      "execution_count": 3,
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "0|zh:你喜欢看\n",
+            "1|ja:アニメ\n",
+            "2|zh:吗\n"
+          ]
+        }
+      ],
+      "source": [
+        "text = \"你喜欢看アニメ吗\"\n",
+        "\n",
+        "substr = lang_splitter.split_by_lang(\n",
+        "    text=text,\n",
+        ")\n",
+        "for index, item in enumerate(substr):\n",
+        "    print(f\"{index}|{item.lang}:{item.text}\")"
+      ]
+    },
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "0|zh:你喜欢看\n",
-      "1|ja:アニメ\n",
-      "2|zh:吗\n",
-      "3|punctuation:？\n",
-      "4|zh:我也喜欢看\n",
-      "----------------------\n",
-      "0|en:Please star this project on GitHub\n",
-      "1|punctuation:, \n",
-      "2|en:Thanks you\n",
-      "3|punctuation:. \n",
-      "4|en:I love you\n",
-      "5|zh:请加星这个项目\n",
-      "6|punctuation:，\n",
-      "7|zh:谢谢你\n",
-      "8|punctuation:。\n",
-      "9|zh:我爱你\n",
-      "10|ja:この項目をスターしてください\n",
-      "11|punctuation:、\n",
-      "12|ja:ありがとうございます\n",
-      "13|punctuation:！\n",
-      "14|ja:愛してる\n",
-      "----------------------\n",
-      "0.005997896194458008\n"
-     ]
-    }
-   ],
-   "source": [
-    "lang_splitter.merge_across_punctuation = False\n",
-    "time1 = time.time()\n",
-    "for text in texts:\n",
-    "    substr = lang_splitter.split_by_lang(\n",
-    "        text=text,\n",
-    "    )\n",
-    "    for index, item in enumerate(substr):\n",
-    "        print(f\"{index}|{item.lang}:{item.text}\")\n",
-    "    print(\"----------------------\")\n",
-    "time2 = time.time()\n",
-    "print(time2 - time1)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
+      "cell_type": "code",
+      "execution_count": 4,
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "0|zh:你喜欢看\n",
+            "1|ja:アニメ\n",
+            "2|zh:吗？我也喜欢看\n",
+            "----------------------\n",
+            "0|en:Please star this project on GitHub, Thanks you. I love you\n",
+            "1|zh:请加星这个项目，谢谢你。我爱你\n",
+            "2|ja:この項目をスターしてください、ありがとうございます！愛してる\n",
+            "----------------------\n",
+            "0.007998943328857422\n"
+          ]
+        }
+      ],
+      "source": [
+        "lang_splitter.merge_across_punctuation=True\n",
+        "import time\n",
+        "texts = [\n",
+        "    \"你喜欢看アニメ吗？我也喜欢看\",\n",
+        "    \"Please star this project on GitHub, Thanks you. I love you请加星这个项目，谢谢你。我爱你この項目をスターしてください、ありがとうございます！愛してる\",\n",
+        "]\n",
+        "time1 = time.time()\n",
+        "for text in texts:\n",
+        "    substr = lang_splitter.split_by_lang(\n",
+        "        text=text,\n",
+        "    )\n",
+        "    for index, item in enumerate(substr):\n",
+        "        print(f\"{index}|{item.lang}:{item.text}\")\n",
+        "    print(\"----------------------\")\n",
+        "time2 = time.time()\n",
+        "print(time2 - time1)"
+      ]
+    },
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "0|zh:衬衫的价格是\n",
-      "1|digit:9.15\n",
-      "2|zh:便士\n"
-     ]
+      "cell_type": "code",
+      "execution_count": 5,
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "0|zh:你喜欢看\n",
+            "1|ja:アニメ\n",
+            "2|zh:吗\n",
+            "3|punctuation:？\n",
+            "4|zh:我也喜欢看\n",
+            "----------------------\n",
+            "0|en:Please star this project on GitHub\n",
+            "1|punctuation:, \n",
+            "2|en:Thanks you\n",
+            "3|punctuation:. \n",
+            "4|en:I love you\n",
+            "5|zh:请加星这个项目\n",
+            "6|punctuation:，\n",
+            "7|zh:谢谢你\n",
+            "8|punctuation:。\n",
+            "9|zh:我爱你\n",
+            "10|ja:この項目をスターしてください\n",
+            "11|punctuation:、\n",
+            "12|ja:ありがとうございます\n",
+            "13|punctuation:！\n",
+            "14|ja:愛してる\n",
+            "----------------------\n",
+            "0.005997896194458008\n"
+          ]
+        }
+      ],
+      "source": [
+        "lang_splitter.merge_across_punctuation = False\n",
+        "time1 = time.time()\n",
+        "for text in texts:\n",
+        "    substr = lang_splitter.split_by_lang(\n",
+        "        text=text,\n",
+        "    )\n",
+        "    for index, item in enumerate(substr):\n",
+        "        print(f\"{index}|{item.lang}:{item.text}\")\n",
+        "    print(\"----------------------\")\n",
+        "time2 = time.time()\n",
+        "print(time2 - time1)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 6,
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "0|zh:衬衫的价格是\n",
+            "1|digit:9.15\n",
+            "2|zh:便士\n"
+          ]
+        }
+      ],
+      "source": [
+        "lang_splitter.merge_across_digit = False\n",
+        "texts = [\n",
+        "    \"衬衫的价格是9.15便士\",\n",
+        "]\n",
+        "for text in texts:\n",
+        "    substr = lang_splitter.split_by_lang(\n",
+        "        text=text,\n",
+        "    )\n",
+        "    for index, item in enumerate(substr):\n",
+        "        print(f\"{index}|{item.lang}:{item.text}\")"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "melotts",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.10.14"
     }
-   ],
-   "source": [
-    "lang_splitter.merge_across_digit = False\n",
-    "texts = [\n",
-    "    \"衬衫的价格是9.15便士\",\n",
-    "]\n",
-    "for text in texts:\n",
-    "    substr = lang_splitter.split_by_lang(\n",
-    "        text=text,\n",
-    "    )\n",
-    "    for index, item in enumerate(substr):\n",
-    "        print(f\"{index}|{item.lang}:{item.text}\")"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "melotts",
-   "language": "python",
-   "name": "python3"
   },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.14"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
+  "nbformat": 4,
+  "nbformat_minor": 2
 }