-
Notifications
You must be signed in to change notification settings - Fork 6
/
string_generator.py
147 lines (127 loc) · 4.64 KB
/
string_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import random
import re
import string
import csv
import requests
from bs4 import BeautifulSoup
from bidi import algorithm
from bidi.algorithm import get_display
import arabic_reshaper
def make_farsi_text(x):
    """
    Return *x* passed through ``arabic_reshaper.reshape`` and then
    ``bidi.algorithm.get_display``.

    Presumably this joins Arabic-script letters into their contextual forms
    and reorders the text for visual (right-to-left) rendering -- confirm
    against the two libraries' documentation.
    """
    return get_display(arabic_reshaper.reshape(x))
def create_strings_from_file(filename, count, lang):
    """
    Create `count` strings by reading lines from a file under ``dicts/``.

    Each line is stripped of surrounding whitespace and truncated to 200
    characters. When the file holds fewer than `count` lines, the lines are
    repeated in order until `count` strings have been produced.

    Args:
        filename: Name of the file, relative to the ``dicts/`` directory.
        count: Number of strings to return; values <= 0 yield an empty list.
        lang: Language code; ``"fa"`` routes every string through
            ``make_farsi_text``.

    Returns:
        A list of exactly `count` strings (empty when `count` <= 0).

    Raises:
        Exception: If the file contains no lines at all.
    """
    with open('dicts/' + filename, 'r', encoding="utf8") as f:
        lines = [line.strip()[0:200] for line in f]

    if not lines:
        raise Exception("No lines could be read in file")
    if count <= 0:
        return []

    # Only the first `count` lines can ever appear in the output, so reshape
    # (for Farsi) just those, and only once, instead of once per repetition.
    base = lines[:count]
    if lang == "fa":
        base = [make_farsi_text(line) for line in base]

    # Tile the lines enough times to cover `count` (ceiling division), then trim.
    repeats = -(-count // len(base))
    return (base * repeats)[:count]
def create_strings_from_csv(filename, count, lang):
    """
    Create up to `count` strings from the first column of a tab-separated
    file under ``dicts/``.

    For every non-empty row, commas are removed from the first column and a
    single trailing space is appended (kept for backward compatibility with
    the strings this function has always produced). Blank rows are skipped
    and do not count towards `count`.

    Args:
        filename: Name of the file, relative to the ``dicts/`` directory.
        count: Maximum number of strings to return.
        lang: Language code; ``"fa"`` routes every string through
            ``make_farsi_text``.

    Returns:
        A list of at most `count` strings; shorter if the file runs out of
        rows first.
    """
    strings = []
    # newline='' lets the csv module do its own line-ending handling, as the
    # csv documentation recommends for files handed to csv.reader.
    with open('dicts/' + filename, 'r', encoding="utf8", newline='') as f:
        for row in csv.reader(f, delimiter="\t"):
            # Stop reading as soon as enough strings have been collected.
            if len(strings) >= count:
                break
            # csv.reader yields [] for blank lines; row[0] would raise there.
            if not row:
                continue
            text = row[0].replace(',', '') + ' '
            strings.append(make_farsi_text(text) if lang == "fa" else text)
    return strings
def create_strings_from_dict(length, allow_variable, count, lang_dict, lang):
    """
    Build `count` strings, each made of words drawn at random from
    `lang_dict` and separated by single spaces.

    Every string holds `length` words, or a random number between 1 and
    `length` when `allow_variable` is true. The last character of each
    picked entry is dropped (presumably a trailing newline left over from
    reading the dictionary file -- confirm against the caller). With
    ``lang == "fa"`` the finished string goes through ``make_farsi_text``.
    """
    entries = len(lang_dict)
    results = []
    for _ in range(count):
        if allow_variable:
            word_total = random.randint(1, length)
        else:
            word_total = length
        picked = [
            lang_dict[random.randrange(entries)][:-1]
            for _ in range(word_total)
        ]
        sentence = ' '.join(picked)
        if lang == "fa":
            sentence = make_farsi_text(sentence)
        results.append(sentence)
    return results
def create_strings_from_wikipedia(minimum_length, count, lang):
    """
    Create `count` strings by fetching random Wikipedia articles and taking
    text lines from them.

    Each line of the page text is reduced to its word tokens (letters,
    digits, underscores and apostrophes) joined by single spaces, and
    truncated to 200 characters. A line is kept only if it has more than
    `minimum_length` space-separated words and does not mention
    "Wikipedia"/"wikipedia". Pages are fetched until enough lines have been
    collected.

    Args:
        minimum_length: A line must contain more than this many words.
        count: Number of strings to return; values <= 0 fetch nothing.
        lang: Wikipedia language subdomain (e.g. ``"en"``, ``"fa"``).

    Returns:
        A list of exactly `count` strings (empty when `count` <= 0).

    Raises:
        requests.RequestException: On timeout, connection failure or an
            HTTP error status.
    """
    # NOTE(review): unlike the other generators in this file, the result is
    # not passed through make_farsi_text for lang == "fa" -- confirm whether
    # that is intentional.
    word_pattern = re.compile(r"[\w']+")
    url = 'https://{}.wikipedia.org/wiki/Special:Random'.format(lang)

    sentences = []
    while len(sentences) < count:
        # We fetch a random page. Without a timeout a stalled connection
        # would block this loop forever.
        page = requests.get(url, timeout=10)
        # Do not harvest "sentences" from an HTTP error page.
        page.raise_for_status()

        soup = BeautifulSoup(page.text, 'html.parser')
        for tag in soup(["script", "style"]):
            tag.extract()

        cleaned = (
            ' '.join(word_pattern.findall(raw.strip()))[0:200]
            for raw in soup.get_text().splitlines()
        )
        # Only take lines of a certain length that do not mention Wikipedia
        lines = [
            s for s in cleaned
            if len(s.split(' ')) > minimum_length
            and "Wikipedia" not in s
            and "wikipedia" not in s
        ]

        # Remove the last lines that talk about contributing
        sentences.extend(lines[0:max(1, len(lines) - 5)])

    return sentences[0:count]
def create_strings_randomly(length, allow_variable, count, let, num, sym, lang):
    """
    Create `count` strings by randomly sampling from a pool of characters.

    Each string is made of space-separated random "words". A string holds
    `length` words, or a random number between 1 and `length` when
    `allow_variable` is true. Words are 1-2 characters long for
    ``lang == 'cn'`` and 2-10 characters long otherwise.

    Args:
        length: Number (or maximum number) of words per string.
        allow_variable: Pick the word count at random in ``[1, length]``.
        count: Number of strings to generate.
        let: Include letters (CJK ideographs when ``lang == 'cn'``,
            ASCII letters otherwise).
        num: Include the digits 0-9.
        sym: Include ASCII punctuation symbols.
        lang: Language code; ``"fa"`` routes every string through
            ``make_farsi_text``.

    Returns:
        A list of `count` random strings.
    """
    # If none specified, use all three
    if True not in (let, num, sym):
        let, num, sym = True, True, True

    pool = ''
    if let:
        if lang == 'cn':
            # Unicode range of CJK characters
            pool += ''.join(chr(i) for i in range(19968, 40908))
        else:
            pool += string.ascii_letters
    if num:
        pool += "0123456789"
    if sym:
        pool += "!\"#$%&'()*+,-./:;?@[\\]^_`{|}~"

    if lang == 'cn':
        min_seq_len, max_seq_len = 1, 2
    else:
        min_seq_len, max_seq_len = 2, 10

    strings = []
    for _ in range(count):
        word_total = random.randint(1, length) if allow_variable else length
        words = []
        for _ in range(word_total):
            seq_len = random.randint(min_seq_len, max_seq_len)
            words.append(''.join(random.choice(pool) for _ in range(seq_len)))
        text = ' '.join(words)
        # Reshape only for Farsi, like the other generators in this file.
        # NOTE(review): the previous code reshaped every string regardless of
        # lang; presumably that was a no-op for these non-Arabic character
        # pools -- confirm.
        strings.append(make_farsi_text(text) if lang == "fa" else text)
    return strings