Skip to content

Commit

Permalink
version(v1.3.5): release
Browse files Browse the repository at this point in the history
  • Loading branch information
DoodleBears committed Jul 8, 2024
1 parent 7acadb2 commit f90aa05
Show file tree
Hide file tree
Showing 2 changed files with 188 additions and 188 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def read(*relpath):

setup(
name="split_lang",
version="1.3.4",
version="1.3.5",
description="A package for splitting text by languages through concatenating over split substrings based on their language",
long_description=read("README.md"),
long_description_content_type="text/markdown",
Expand Down
374 changes: 187 additions & 187 deletions split-lang-demo.ipynb
Original file line number Diff line number Diff line change
@@ -1,196 +1,196 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"%pip install split-lang==1.3.4"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
"cells": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\admin\\.conda\\envs\\melotts\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"from split_lang import LangSplitter\n",
"lang_splitter = LangSplitter()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"%pip install split-lang==1.3.5"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"0|zh:你喜欢看\n",
"1|ja:アニメ\n",
"2|zh:吗\n"
]
}
],
"source": [
"text = \"你喜欢看アニメ吗\"\n",
"\n",
"substr = lang_splitter.split_by_lang(\n",
" text=text,\n",
")\n",
"for index, item in enumerate(substr):\n",
" print(f\"{index}|{item.lang}:{item.text}\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\admin\\.conda\\envs\\melotts\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"from split_lang import LangSplitter\n",
"lang_splitter = LangSplitter()"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"0|zh:你喜欢看\n",
"1|ja:アニメ\n",
"2|zh:吗?我也喜欢看\n",
"----------------------\n",
"0|en:Please star this project on GitHub, Thanks you. I love you\n",
"1|zh:请加星这个项目,谢谢你。我爱你\n",
"2|ja:この項目をスターしてください、ありがとうございます!愛してる\n",
"----------------------\n",
"0.007998943328857422\n"
]
}
],
"source": [
"lang_splitter.merge_across_punctuation=True\n",
"import time\n",
"texts = [\n",
" \"你喜欢看アニメ吗?我也喜欢看\",\n",
" \"Please star this project on GitHub, Thanks you. I love you请加星这个项目,谢谢你。我爱你この項目をスターしてください、ありがとうございます!愛してる\",\n",
"]\n",
"time1 = time.time()\n",
"for text in texts:\n",
" substr = lang_splitter.split_by_lang(\n",
" text=text,\n",
" )\n",
" for index, item in enumerate(substr):\n",
" print(f\"{index}|{item.lang}:{item.text}\")\n",
" print(\"----------------------\")\n",
"time2 = time.time()\n",
"print(time2 - time1)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0|zh:你喜欢看\n",
"1|ja:アニメ\n",
"2|zh:吗\n"
]
}
],
"source": [
"text = \"你喜欢看アニメ吗\"\n",
"\n",
"substr = lang_splitter.split_by_lang(\n",
" text=text,\n",
")\n",
"for index, item in enumerate(substr):\n",
" print(f\"{index}|{item.lang}:{item.text}\")"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"0|zh:你喜欢看\n",
"1|ja:アニメ\n",
"2|zh:吗\n",
"3|punctuation:?\n",
"4|zh:我也喜欢看\n",
"----------------------\n",
"0|en:Please star this project on GitHub\n",
"1|punctuation:, \n",
"2|en:Thanks you\n",
"3|punctuation:. \n",
"4|en:I love you\n",
"5|zh:请加星这个项目\n",
"6|punctuation:,\n",
"7|zh:谢谢你\n",
"8|punctuation:。\n",
"9|zh:我爱你\n",
"10|ja:この項目をスターしてください\n",
"11|punctuation:、\n",
"12|ja:ありがとうございます\n",
"13|punctuation:!\n",
"14|ja:愛してる\n",
"----------------------\n",
"0.005997896194458008\n"
]
}
],
"source": [
"lang_splitter.merge_across_punctuation = False\n",
"time1 = time.time()\n",
"for text in texts:\n",
" substr = lang_splitter.split_by_lang(\n",
" text=text,\n",
" )\n",
" for index, item in enumerate(substr):\n",
" print(f\"{index}|{item.lang}:{item.text}\")\n",
" print(\"----------------------\")\n",
"time2 = time.time()\n",
"print(time2 - time1)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0|zh:你喜欢看\n",
"1|ja:アニメ\n",
"2|zh:吗?我也喜欢看\n",
"----------------------\n",
"0|en:Please star this project on GitHub, Thanks you. I love you\n",
"1|zh:请加星这个项目,谢谢你。我爱你\n",
"2|ja:この項目をスターしてください、ありがとうございます!愛してる\n",
"----------------------\n",
"0.007998943328857422\n"
]
}
],
"source": [
"lang_splitter.merge_across_punctuation=True\n",
"import time\n",
"texts = [\n",
" \"你喜欢看アニメ吗?我也喜欢看\",\n",
" \"Please star this project on GitHub, Thanks you. I love you请加星这个项目,谢谢你。我爱你この項目をスターしてください、ありがとうございます!愛してる\",\n",
"]\n",
"time1 = time.time()\n",
"for text in texts:\n",
" substr = lang_splitter.split_by_lang(\n",
" text=text,\n",
" )\n",
" for index, item in enumerate(substr):\n",
" print(f\"{index}|{item.lang}:{item.text}\")\n",
" print(\"----------------------\")\n",
"time2 = time.time()\n",
"print(time2 - time1)"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"0|zh:衬衫的价格是\n",
"1|digit:9.15\n",
"2|zh:便士\n"
]
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0|zh:你喜欢看\n",
"1|ja:アニメ\n",
"2|zh:吗\n",
"3|punctuation:?\n",
"4|zh:我也喜欢看\n",
"----------------------\n",
"0|en:Please star this project on GitHub\n",
"1|punctuation:, \n",
"2|en:Thanks you\n",
"3|punctuation:. \n",
"4|en:I love you\n",
"5|zh:请加星这个项目\n",
"6|punctuation:,\n",
"7|zh:谢谢你\n",
"8|punctuation:。\n",
"9|zh:我爱你\n",
"10|ja:この項目をスターしてください\n",
"11|punctuation:、\n",
"12|ja:ありがとうございます\n",
"13|punctuation:!\n",
"14|ja:愛してる\n",
"----------------------\n",
"0.005997896194458008\n"
]
}
],
"source": [
"lang_splitter.merge_across_punctuation = False\n",
"time1 = time.time()\n",
"for text in texts:\n",
" substr = lang_splitter.split_by_lang(\n",
" text=text,\n",
" )\n",
" for index, item in enumerate(substr):\n",
" print(f\"{index}|{item.lang}:{item.text}\")\n",
" print(\"----------------------\")\n",
"time2 = time.time()\n",
"print(time2 - time1)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0|zh:衬衫的价格是\n",
"1|digit:9.15\n",
"2|zh:便士\n"
]
}
],
"source": [
"lang_splitter.merge_across_digit = False\n",
"texts = [\n",
" \"衬衫的价格是9.15便士\",\n",
"]\n",
"for text in texts:\n",
" substr = lang_splitter.split_by_lang(\n",
" text=text,\n",
" )\n",
" for index, item in enumerate(substr):\n",
" print(f\"{index}|{item.lang}:{item.text}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "melotts",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
],
"source": [
"lang_splitter.merge_across_digit = False\n",
"texts = [\n",
" \"衬衫的价格是9.15便士\",\n",
"]\n",
"for text in texts:\n",
" substr = lang_splitter.split_by_lang(\n",
" text=text,\n",
" )\n",
" for index, item in enumerate(substr):\n",
" print(f\"{index}|{item.lang}:{item.text}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "melotts",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit f90aa05

Please sign in to comment.