-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
7acadb2
commit f90aa05
Showing
2 changed files
with
188 additions
and
188 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,196 +1,196 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"%%capture\n", | ||
"%pip install split-lang==1.3.4" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [ | ||
"cells": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"c:\\Users\\admin\\.conda\\envs\\melotts\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", | ||
" from .autonotebook import tqdm as notebook_tqdm\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from split_lang import LangSplitter\n", | ||
"lang_splitter = LangSplitter()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"%%capture\n", | ||
"%pip install split-lang==1.3.5" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"0|zh:你喜欢看\n", | ||
"1|ja:アニメ\n", | ||
"2|zh:吗\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"text = \"你喜欢看アニメ吗\"\n", | ||
"\n", | ||
"substr = lang_splitter.split_by_lang(\n", | ||
" text=text,\n", | ||
")\n", | ||
"for index, item in enumerate(substr):\n", | ||
" print(f\"{index}|{item.lang}:{item.text}\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"c:\\Users\\admin\\.conda\\envs\\melotts\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", | ||
" from .autonotebook import tqdm as notebook_tqdm\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from split_lang import LangSplitter\n", | ||
"lang_splitter = LangSplitter()" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"0|zh:你喜欢看\n", | ||
"1|ja:アニメ\n", | ||
"2|zh:吗?我也喜欢看\n", | ||
"----------------------\n", | ||
"0|en:Please star this project on GitHub, Thanks you. I love you\n", | ||
"1|zh:请加星这个项目,谢谢你。我爱你\n", | ||
"2|ja:この項目をスターしてください、ありがとうございます!愛してる\n", | ||
"----------------------\n", | ||
"0.007998943328857422\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"lang_splitter.merge_across_punctuation=True\n", | ||
"import time\n", | ||
"texts = [\n", | ||
" \"你喜欢看アニメ吗?我也喜欢看\",\n", | ||
" \"Please star this project on GitHub, Thanks you. I love you请加星这个项目,谢谢你。我爱你この項目をスターしてください、ありがとうございます!愛してる\",\n", | ||
"]\n", | ||
"time1 = time.time()\n", | ||
"for text in texts:\n", | ||
" substr = lang_splitter.split_by_lang(\n", | ||
" text=text,\n", | ||
" )\n", | ||
" for index, item in enumerate(substr):\n", | ||
" print(f\"{index}|{item.lang}:{item.text}\")\n", | ||
" print(\"----------------------\")\n", | ||
"time2 = time.time()\n", | ||
"print(time2 - time1)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"0|zh:你喜欢看\n", | ||
"1|ja:アニメ\n", | ||
"2|zh:吗\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"text = \"你喜欢看アニメ吗\"\n", | ||
"\n", | ||
"substr = lang_splitter.split_by_lang(\n", | ||
" text=text,\n", | ||
")\n", | ||
"for index, item in enumerate(substr):\n", | ||
" print(f\"{index}|{item.lang}:{item.text}\")" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"0|zh:你喜欢看\n", | ||
"1|ja:アニメ\n", | ||
"2|zh:吗\n", | ||
"3|punctuation:?\n", | ||
"4|zh:我也喜欢看\n", | ||
"----------------------\n", | ||
"0|en:Please star this project on GitHub\n", | ||
"1|punctuation:, \n", | ||
"2|en:Thanks you\n", | ||
"3|punctuation:. \n", | ||
"4|en:I love you\n", | ||
"5|zh:请加星这个项目\n", | ||
"6|punctuation:,\n", | ||
"7|zh:谢谢你\n", | ||
"8|punctuation:。\n", | ||
"9|zh:我爱你\n", | ||
"10|ja:この項目をスターしてください\n", | ||
"11|punctuation:、\n", | ||
"12|ja:ありがとうございます\n", | ||
"13|punctuation:!\n", | ||
"14|ja:愛してる\n", | ||
"----------------------\n", | ||
"0.005997896194458008\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"lang_splitter.merge_across_punctuation = False\n", | ||
"time1 = time.time()\n", | ||
"for text in texts:\n", | ||
" substr = lang_splitter.split_by_lang(\n", | ||
" text=text,\n", | ||
" )\n", | ||
" for index, item in enumerate(substr):\n", | ||
" print(f\"{index}|{item.lang}:{item.text}\")\n", | ||
" print(\"----------------------\")\n", | ||
"time2 = time.time()\n", | ||
"print(time2 - time1)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"0|zh:你喜欢看\n", | ||
"1|ja:アニメ\n", | ||
"2|zh:吗?我也喜欢看\n", | ||
"----------------------\n", | ||
"0|en:Please star this project on GitHub, Thanks you. I love you\n", | ||
"1|zh:请加星这个项目,谢谢你。我爱你\n", | ||
"2|ja:この項目をスターしてください、ありがとうございます!愛してる\n", | ||
"----------------------\n", | ||
"0.007998943328857422\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"lang_splitter.merge_across_punctuation=True\n", | ||
"import time\n", | ||
"texts = [\n", | ||
" \"你喜欢看アニメ吗?我也喜欢看\",\n", | ||
" \"Please star this project on GitHub, Thanks you. I love you请加星这个项目,谢谢你。我爱你この項目をスターしてください、ありがとうございます!愛してる\",\n", | ||
"]\n", | ||
"time1 = time.time()\n", | ||
"for text in texts:\n", | ||
" substr = lang_splitter.split_by_lang(\n", | ||
" text=text,\n", | ||
" )\n", | ||
" for index, item in enumerate(substr):\n", | ||
" print(f\"{index}|{item.lang}:{item.text}\")\n", | ||
" print(\"----------------------\")\n", | ||
"time2 = time.time()\n", | ||
"print(time2 - time1)" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"0|zh:衬衫的价格是\n", | ||
"1|digit:9.15\n", | ||
"2|zh:便士\n" | ||
] | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"0|zh:你喜欢看\n", | ||
"1|ja:アニメ\n", | ||
"2|zh:吗\n", | ||
"3|punctuation:?\n", | ||
"4|zh:我也喜欢看\n", | ||
"----------------------\n", | ||
"0|en:Please star this project on GitHub\n", | ||
"1|punctuation:, \n", | ||
"2|en:Thanks you\n", | ||
"3|punctuation:. \n", | ||
"4|en:I love you\n", | ||
"5|zh:请加星这个项目\n", | ||
"6|punctuation:,\n", | ||
"7|zh:谢谢你\n", | ||
"8|punctuation:。\n", | ||
"9|zh:我爱你\n", | ||
"10|ja:この項目をスターしてください\n", | ||
"11|punctuation:、\n", | ||
"12|ja:ありがとうございます\n", | ||
"13|punctuation:!\n", | ||
"14|ja:愛してる\n", | ||
"----------------------\n", | ||
"0.005997896194458008\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"lang_splitter.merge_across_punctuation = False\n", | ||
"time1 = time.time()\n", | ||
"for text in texts:\n", | ||
" substr = lang_splitter.split_by_lang(\n", | ||
" text=text,\n", | ||
" )\n", | ||
" for index, item in enumerate(substr):\n", | ||
" print(f\"{index}|{item.lang}:{item.text}\")\n", | ||
" print(\"----------------------\")\n", | ||
"time2 = time.time()\n", | ||
"print(time2 - time1)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"0|zh:衬衫的价格是\n", | ||
"1|digit:9.15\n", | ||
"2|zh:便士\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"lang_splitter.merge_across_digit = False\n", | ||
"texts = [\n", | ||
" \"衬衫的价格是9.15便士\",\n", | ||
"]\n", | ||
"for text in texts:\n", | ||
" substr = lang_splitter.split_by_lang(\n", | ||
" text=text,\n", | ||
" )\n", | ||
" for index, item in enumerate(substr):\n", | ||
" print(f\"{index}|{item.lang}:{item.text}\")" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "melotts", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.14" | ||
} | ||
], | ||
"source": [ | ||
"lang_splitter.merge_across_digit = False\n", | ||
"texts = [\n", | ||
" \"衬衫的价格是9.15便士\",\n", | ||
"]\n", | ||
"for text in texts:\n", | ||
" substr = lang_splitter.split_by_lang(\n", | ||
" text=text,\n", | ||
" )\n", | ||
" for index, item in enumerate(substr):\n", | ||
" print(f\"{index}|{item.lang}:{item.text}\")" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "melotts", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.14" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |