-
Notifications
You must be signed in to change notification settings - Fork 2
/
main.py
91 lines (74 loc) · 2.89 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
"""
Author: Ryan Aquino
Description: Scrape shopee.com products per category and saves it to a Postgres database
"""
import concurrent.futures
from datetime import datetime
from loguru import logger
from helpers.database import create_product_table, save_product, verify_tables
from helpers.scrape_product_details_helper import (
driver,
get_product_details,
get_product_urls,
)
def scrape_task(url: str) -> list:
    """
    Scrape one category page for product URLs using a dedicated browser.

    :param url: Product category URL
    :return: List of product URLs found on the category page
    """
    chrome_driver = driver()
    try:
        products_urls = get_product_urls(chrome_driver, url)
        logger.info(f"Product Count: {len(products_urls)}")
    finally:
        # BUG FIX: quit the browser even when get_product_urls raises;
        # the original leaked a Chrome process on every failed task.
        chrome_driver.quit()
    return products_urls
def main() -> None:
    """
    Scrape Shopee category pages for product URLs, then fetch each
    product's details and persist them to the Postgres database.

    Both phases fan out over process pools because each worker drives
    its own headless browser. URLs that fail during detail scraping are
    collected and written once to a timestamped exceptions file.
    """
    # Ensure the destination table exists before any worker writes to it.
    if not verify_tables():
        create_product_table()
    urls = [
        "https://shopee.ph/Cameras-cat.18560",
        "https://shopee.ph/Gaming-cat.20718",
        "https://shopee.ph/Laptops-Computers-cat.18596",
        "https://shopee.ph/Home-Entertainment-cat.18529",
        "https://shopee.ph/Mobiles-Gadgets-cat.24456",
        "https://shopee.ph/Men's-Shoes-cat.123",
        "https://shopee.ph/Mobile-Accessories-cat.109",
        "https://shopee.ph/Sports-Travel-cat.1029",
        "https://shopee.ph/Toys-Games-Collectibles-cat.115",
        "https://shopee.ph/Women's-Shoes-cat.531",
        "https://shopee.ph/Women's-Accessories-cat.106",
        "https://shopee.ph/Women's-Apparel-cat.102",
    ]
    product_list_urls = []
    exception_urls = []

    # Phase 1: collect product URLs per category, four browsers at a time.
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        results = [executor.submit(scrape_task, url) for url in urls]
        for process in concurrent.futures.as_completed(results):
            product_list_urls += process.result()

    products_count = len(product_list_urls)
    logger.info(products_count)

    # Phase 2: scrape details for every product URL and save each one.
    with concurrent.futures.ProcessPoolExecutor(max_workers=6) as executor:
        product_details = [
            executor.submit(get_product_details, product_url)
            for product_url in product_list_urls
        ]
        for process in concurrent.futures.as_completed(product_details):
            try:
                result = process.result()
                if not result:
                    continue
                logger.info(f"{result['name']} - Processing")
                save_product(result)
                logger.info(f"{result['name']} - Success")
            except Exception as exception_url:
                exception_urls.append(str(exception_url))
                # Record the failure instead of collecting it silently.
                logger.exception("Product detail scrape failed")
            finally:
                # BUG FIX: the original ran
                #     products_count -= len(exception_urls) + 1
                # on every success, re-subtracting all earlier failures each
                # iteration. Each completed future is exactly one product.
                products_count -= 1
                logger.warning(f"Products exceptions: {len(exception_urls)}")
                logger.info(f"Products remaining: {products_count}")

    # Write the exception report once, after all workers finish, instead of
    # rewriting a fresh timestamped file inside the loop on every failure.
    if exception_urls:
        with open(
            f"logs/exceptions_urls_{datetime.now()}.txt", "w", encoding="utf-8"
        ) as file:
            file.write(str(exception_urls))
if __name__ == "__main__":
main()