-
Notifications
You must be signed in to change notification settings - Fork 1
/
scraper.py
146 lines (112 loc) · 5.58 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import csv
import glob
import math
import os
import time

import pandas as pd
from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.wait import WebDriverWait
# Open a Safari session and load the login page.
browser = webdriver.Safari()
browser.get('https://app.leonardo.ai/auth/login')

# Credentials are read from the environment so real values never have to be
# committed; the fallbacks are the placeholder literals this script shipped with.
login_email = os.environ.get('LEONARDO_EMAIL', 'XXXXXXXXgmail.com')
login_password = os.environ.get('LEONARDO_PASSWORD', 'xxxxxxxxx')

# Wait (up to 10 s) for the e-mail field, then fill in the form.
# WebDriverWait.until() returns the located element, so it is used directly
# instead of being looked up a second time.
email_field = WebDriverWait(browser, 10).until(
    EC.presence_of_element_located(("xpath", '//*[@id="email"]'))
)
email_field.send_keys(login_email)
browser.find_element("xpath", '//*[@id="password"]').send_keys(login_password)
time.sleep(2)
browser.find_element("xpath", '//*[@id="__next"]/div/div[2]/div[2]/div[2]/div[2]/div[1]/form/div/button').submit()
browser.maximize_window()

# Dismiss the modal dialogs shown after login: click the second footer button
# five times, then click the button inside the following modal body.
# NOTE(review): the ":r17:" / ":r18:" ids look auto-generated and may change
# between site releases - confirm these selectors are still valid.
# NOTE(review): indentation was lost in the copy this was restored from; the
# loop body is assumed to be the single click below - confirm.
WebDriverWait(browser, 10).until(
    EC.presence_of_element_located(("xpath", '//*[@id="chakra-modal-:r17:"]/footer/div/button[2]'))
)
for _ in range(5):
    # Re-locate the button on every pass in case the modal re-renders it.
    browser.find_element("xpath", '//*[@id="chakra-modal-:r17:"]/footer/div/button[2]').click()
browser.find_element("xpath", '//*[@id="chakra-modal--body-:r18:"]/div/div[2]/div/div/div/div[3]/button').click()
time.sleep(5)
# Scroll down one screen height at a time so the page loads a large number of
# images to download. The loop stops after 600 seconds, or earlier once the
# next scroll target lies beyond the bottom of the document.
screen_height = browser.execute_script("return window.screen.height;")
t_end = time.time() + 600
i = 1
while time.time() < t_end:
    # Jump to the i-th multiple of the screen height.
    scroll_command = f"window.scrollTo(0, {screen_height}*{i});"
    browser.execute_script(scroll_command)
    i += 1
    time.sleep(3)
    # Re-read the document height on every pass, because it can change after
    # the page has been scrolled.
    scroll_height = browser.execute_script("return document.body.scrollHeight;")
    if screen_height * i > scroll_height:
        # The next target would be past the end of the page: nothing left to scroll.
        break

# Scroll back to a fixed offset near the top and pause before continuing.
time.sleep(2)
browser.execute_script("window.scrollTo(0,220);")
time.sleep(8)
# ---------------------------------------------------------------------------
# Walk through the image viewer: download each image, resize it, move it into
# the dataset folder and append "<image name>~<prompt>" to the CSV file.
# ---------------------------------------------------------------------------

# Locations on disk (unchanged from the original script).
DOWNLOADS_DIR = '/Users/poojasmac/Downloads/'
DOWNLOADS_JPG_GLOB = '/Users/poojasmac/Downloads/*.jpg'
DATASET_DIR = "/Users/poojasmac/Downloads/ImageDataset/"

# Page selectors (unchanged from the original script).
FIRST_PHOTO_XPATH = '//*[@id="__next"]/div/div[2]/div/div[3]/div[2]/div/div/div[1]/div[1]/div/div/div[2]'
# Clicking this button is followed by a wait "for image to download".
DOWNLOAD_BUTTON_XPATH = "//*[starts-with(@id, 'chakra-modal--body-:')]/div/div[1]/div[1]/div[5]/div/button[2]"
# Clicked to move on after an image is handled or skipped.
# NOTE(review): presumably the viewer's "next image" button - confirm.
NEXT_BUTTON_XPATH = '/html/body/div[4]/div[2]/div/div[2]/button'
# Variant of the button above used while count == 1.
FIRST_NEXT_BUTTON_XPATH = '/html/body/div[4]/div[2]/div/div/button'
PROMPT_XPATH = "//*[starts-with(@id, 'chakra-modal--body-:')]/div/div[1]/div[2]/div[2]/div[1]/div/div/p"


def _target_size(width, height, short_side=400):
    """Return ``(width, height)`` scaled so the shorter side equals *short_side*.

    The aspect ratio is kept and the longer side is rounded up to a whole
    pixel. The shorter side is set to *short_side* directly rather than being
    derived through two float divisions, which could land a hair above the
    target and then be rounded up by one pixel.
    """
    if width <= height:
        return short_side, math.ceil(height * short_side / width)
    return math.ceil(width * short_side / height), short_side


def _click_when_present(xpath, timeout=10):
    """Wait until *xpath* is present in the page, then click the element."""
    WebDriverWait(browser, timeout).until(EC.presence_of_element_located(("xpath", xpath)))
    browser.find_element("xpath", xpath).click()


# NOTE(review): indentation was lost in the copy this was restored from; the
# nesting below is reconstructed from the statement order - confirm.
try:
    # Go to first photo on the page
    browser.find_element("xpath", FIRST_PHOTO_XPATH).click()
    count = 1  # number of the next image to save; also used in its file name
    os.makedirs(DATASET_DIR, exist_ok=True)

    # newline='' is what the csv module asks for; an explicit encoding keeps
    # prompts with non-ASCII characters from depending on the locale.
    with open('dataset copy.csv', 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter="~")
        while True:
            time.sleep(2)
            # Snapshot the existing .jpg files so that only a file created by
            # this iteration's click is treated as the downloaded image.
            known_files = set(glob.glob(DOWNLOADS_JPG_GLOB))

            try:
                _click_when_present(DOWNLOAD_BUTTON_XPATH)
            except WebDriverException:
                # The download button could not be used: try to move on; stop
                # entirely when that is not possible either.
                try:
                    _click_when_present(NEXT_BUTTON_XPATH)
                    continue
                except WebDriverException:
                    print("elements on page did not load properly")
                    break

            time.sleep(8)  # give time for image to download

            new_files = [path for path in glob.glob(DOWNLOADS_JPG_GLOB) if path not in known_files]
            if not new_files:
                # Nothing was downloaded for this image: skip it.
                _click_when_present(NEXT_BUTTON_XPATH)
                continue
            downloaded = max(new_files, key=os.path.getctime)

            # Give the download its dataset name, then shrink it in place.
            image_name = f"img{count}.jpg"
            staged_path = os.path.join(DOWNLOADS_DIR, image_name)
            os.rename(downloaded, staged_path)
            with Image.open(staged_path) as image:
                print(f"Original image size: {image.size}")
                # resize() returns a new, fully loaded image, so the source
                # file can be closed before it is overwritten.
                resized = image.resize(_target_size(*image.size))
            resized.save(staged_path)
            os.rename(staged_path, DATASET_DIR + image_name)

            # Writing image name and corresponding prompt to the csv file
            prompt = browser.find_element("xpath", PROMPT_XPATH).text
            writer.writerow([image_name, prompt])

            # NOTE(review): count stays at 1 when the first image is skipped,
            # so the count == 1 selector may then be applied to a later image.
            next_xpath = FIRST_NEXT_BUTTON_XPATH if count == 1 else NEXT_BUTTON_XPATH
            browser.find_element("xpath", next_xpath).click()
            count += 1
finally:
    # The CSV file is closed by the with-block. quit() ends the whole driver
    # session (close() only closes the window) and runs even after an error.
    browser.quit()