Add two proxies (#203)
WangShayne authored Dec 1, 2023
1 parent 0a6d861 commit 2344ee1
Showing 2 changed files with 87 additions and 0 deletions.
38 changes: 38 additions & 0 deletions proxypool/crawlers/public/docip.py
@@ -0,0 +1,38 @@
import time
import json

from loguru import logger

from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler

BASE_URL = 'https://www.docip.net/data/free.json?t={date}'


class DocipCrawler(BaseCrawler):
    """
    Docip crawler, https://www.docip.net/data/free.json
    """
    # the endpoint is date-stamped, so build the URL for today
    urls = [BASE_URL.format(date=time.strftime("%Y%m%d", time.localtime()))]

    def parse(self, html):
        """
        parse the JSON response to get proxies
        :return: proxy generator
        """
        try:
            result = json.loads(html)
            proxy_list = result['data']
            for proxy_item in proxy_list:
                host = proxy_item['ip']
                port = proxy_item['port']
                yield Proxy(host=host, port=port)
        except json.JSONDecodeError:
            logger.error('failed to decode Docip response as JSON')
            return


if __name__ == '__main__':
    crawler = DocipCrawler()
    for proxy in crawler.crawl():
        print(proxy)
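For reference, a minimal sketch of the date-stamped URL the crawler requests; the exact value depends on the local date, and 20231201 is shown only because it matches this commit's date:

import time

# illustration only: the free.json endpoint is keyed by the local date
url = 'https://www.docip.net/data/free.json?t={date}'.format(
    date=time.strftime("%Y%m%d", time.localtime()))
print(url)  # e.g. https://www.docip.net/data/free.json?t=20231201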
49 changes: 49 additions & 0 deletions proxypool/crawlers/public/uqidata.py
@@ -0,0 +1,49 @@
from pyquery import PyQuery as pq

from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler

BASE_URL = 'https://ip.uqidata.com/free/index.html'


class UqidataCrawler(BaseCrawler):
    """
    Uqidata crawler, https://ip.uqidata.com/free/index.html
    """
    urls = [BASE_URL]
    ignore = True

    @staticmethod
    def encode(input_str):
        """
        decode the obfuscated port: each letter maps to a digit
        (A=0 ... I=8, Z=9), the digits are joined, and the result
        is shifted right by 3 bits (i.e. divided by 8)
        """
        tmp = []
        for char in input_str:
            tmp.append("ABCDEFGHIZ".find(char))
        result = "".join(str(i) for i in tmp)
        result = int(result) >> 0x03
        return result

    def parse(self, html):
        """
        parse html file to get proxies
        :return: proxy generator
        """
        doc = pq(html)
        trs = doc('#main_container .inner table tbody tr:nth-child(n+3)').items()
        for tr in trs:
            # the IP is split across child elements, some of which are
            # hidden with display: none as an anti-scraping measure
            ip_html = tr('td.ip').find("*").items()
            host = ''
            for i in ip_html:
                if i.attr('style') is not None and 'none' in i.attr('style'):
                    continue
                if i.text() == '':
                    continue
                host += i.text()

            # the port is hidden in the second CSS class of the port cell
            port_code = tr('td.port').attr('class').split(' ')[1]
            port = UqidataCrawler.encode(port_code)
            yield Proxy(host=host, port=port)


if __name__ == '__main__':
    crawler = UqidataCrawler()
    for proxy in crawler.crawl():
        print(proxy)
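To make the port scheme concrete, here is a worked example of the decoding that encode performs; the class suffix 'GEGEA' is a hypothetical input chosen for illustration, not one observed on the site:

# hypothetical class suffix; letters map to digits via "ABCDEFGHIZ"
code = 'GEGEA'                  # G=6, E=4, G=6, E=4, A=0 -> "64640"
digits = "".join(str("ABCDEFGHIZ".find(c)) for c in code)
port = int(digits) >> 0x03      # 64640 >> 3 == 64640 // 8 == 8080
print(port)                     # 8080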
