-
Notifications
You must be signed in to change notification settings - Fork 0
/
string_tool.py
66 lines (50 loc) · 1.59 KB
/
string_tool.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# -*- coding:utf-8 -*-
from __future__ import absolute_import
from builtins import chr
import re
import codecs
# Pattern matching one or more consecutive whitespace characters.
blank_regexp = re.compile(r'\s+')

# Punctuation entries loaded at import time: one entry per line of the data
# file, with surrounding whitespace stripped. The path is relative, so it is
# resolved against the current working directory of the process.
punctuation = set()
with codecs.open("data/qid_answer_expand/punctuation", "r", 'utf8') as fin:
    punctuation.update(entry.strip() for entry in fin)
def drop_punctuation(string):
    """Replace every punctuation character in ``string`` with a space.

    A character counts as punctuation when it is a member of the
    module-level ``punctuation`` set. Such characters are not removed but
    swapped for a single " "; every other character is copied unchanged.

    Returns the resulting string.
    """
    return "".join(" " if ch in punctuation else ch for ch in string)
def split_string(string):
    """Return the characters of ``string`` as a list, one item per character."""
    return list(string)
def strQ2B(string):
    """Convert full-width characters in ``string`` to half-width ones.

    The full-width space (U+3000) becomes an ASCII space (U+0020). Code
    points U+FF01..U+FF5E are shifted down by 0xFEE0, which lands them on
    U+0021..U+007E. All other characters are passed through unchanged.

    Returns the converted string.
    """
    converted = []
    for ch in string:
        code = ord(ch)
        if code == 0x3000:
            # The full-width space does not follow the fixed offset.
            code = 0x20
        elif 0xFF01 <= code <= 0xFF5E:
            code -= 0xFEE0
        converted.append(chr(code))
    return "".join(converted)
def strB2Q(string):
    """Convert half-width characters in ``string`` to full-width ones.

    The ASCII space (U+0020) becomes the full-width space (U+3000). The
    remaining code points up to U+007E, i.e. U+0021..U+007E, are shifted up
    by 0xFEE0 onto U+FF01..U+FF5E. All other characters are passed through
    unchanged.

    Returns the converted string.
    """
    pieces = []
    for ch in string:
        code = ord(ch)
        if code == 0x20:
            # The space does not follow the fixed offset.
            code = 0x3000
        elif 0x21 <= code <= 0x7E:
            code += 0xFEE0
        pieces.append(chr(code))
    return "".join(pieces)
def filter_blank(string):
    """Return ``string`` with all whitespace removed.

    Every run matched by the module-level ``blank_regexp`` pattern is
    replaced with the empty string.
    """
    replacement = ''
    return blank_regexp.sub(replacement, string)
def filter_extra_blank(string):
    """Return ``string`` with each whitespace run collapsed to one space.

    Every run matched by the module-level ``blank_regexp`` pattern is
    replaced with a single " ". Leading and trailing runs are collapsed
    too, not stripped.
    """
    replacement = ' '
    return blank_regexp.sub(replacement, string)